diff --git a/frontend/src/components/Chatbot.js b/frontend/src/components/Chatbot.js index 331af5e..ecf3f92 100644 --- a/frontend/src/components/Chatbot.js +++ b/frontend/src/components/Chatbot.js @@ -17,7 +17,7 @@ function Chatbot() {
- +
// Build the prompt sent to the ChatGPT API: each relevant code chunk on its
// own line, followed by the user's question.
function buildFileRequest(files, userInput) {
  return files.map((file) => `${file}\n`).join("") + userInput;
}

/**
 * Sends the most relevant files plus the user's question to the ChatGPT API.
 *
 * @param {string[]} files - Source-code chunks deemed relevant to the query.
 * @param {string} userInput - The user's original question.
 * @returns {Promise<void>} Resolves once the ChatGPT response has been handled.
 */
export async function sendFiles(files, userInput) {
  await fetchChatGPTResponse(buildFileRequest(files, userInput));
}
response.data; - - - } catch (error) { - console.error('Error generating embeddings with OpenAI:', error); - } -} - -module.exports = generateEmbeddings; diff --git a/server/config/pineconeConfig/pineconeManager.js b/server/config/pineconeConfig/pineconeManager.js index 395fbc0..3fc7333 100644 --- a/server/config/pineconeConfig/pineconeManager.js +++ b/server/config/pineconeConfig/pineconeManager.js @@ -26,6 +26,16 @@ class PineconeManager { this.index = this.pc.index(indexName); } + /** + * Creates a delay for a specified amount of time. + * + * @param {number} ms - The delay time in milliseconds. + * @returns {Promise} A promise that resolves after the specified time. + */ + delay(ms) { + return new Promise(resolve => setTimeout(resolve, ms)); + } + /** * Initializes the Pinecone index with the specified configuration. * Creates the index on the Pinecone server if it does not already exist. @@ -45,57 +55,70 @@ class PineconeManager { }, }, }); + this.index = this.pc.index(this.indexName); // Reinitialize the index after creation + await this.delay(3000); // 3 second delay } /** * Upserts embeddings into the specified namespace of the Pinecone index. * * @async - * @param {number[]} embeddings - The embeddings vector to upsert. - * @param {string} [namespace="SampleCode"] - The namespace in the index to upsert to. - * @param {string} [id="SampleCode"] - The unique ID for the vector. + * @param {Object} data - The dictionary of functions and classes with embeddings. + * @param {string} [namespace="codebase"] - The namespace in the index to upsert to. * @returns {Promise} A promise that resolves once the embeddings are upserted. 
*/ - async upsertEmbeddings(embeddings, namespace, id) { - await this.index.namespace(namespace).upsert([ - { - id: id, - values: embeddings, - }, - ]); - } + async upsertEmbeddings(data, namespace = "codebase") { + // Prepare the upsert request payload + const upsertPayload = []; - /** - * Retrieves and logs the statistics of the Pinecone index. - * - * @async - * @returns {Promise} A promise that resolves once the index stats are logged. - */ - async checkIndex() { - const stats = await this.index.describeIndexStats(); - console.log(stats); + // Handle functions + data.functions.forEach((func) => { + if (func.embedding && Array.isArray(func.embedding)) { + upsertPayload.push({ + id: func.function_name, + values: func.embedding, + metadata: { filepath: func.filepath, type: 'function' } + }); + } + }); + + // Handle classes + data.classes.forEach((cls) => { + if (cls.embedding && Array.isArray(cls.embedding)) { + upsertPayload.push({ + id: cls.class_name, + values: cls.embedding, + metadata: { filepath: cls.filepath, type: 'class' } + }); + } + }); + + // Upsert the data into Pinecone + await this.index.namespace(namespace).upsert(upsertPayload); + await this.delay(3000); // 3 second delay + console.log('Embeddings upserted successfully.'); } /** - * Performs a similarity search within the specified namespace of the Pinecone index. - * Logs the search results to the console. + * Queries the Pinecone index using the provided embedding. * * @async - * @param {number[]} vector - The query vector for the similarity search. - * @param {number} [topK=3] - The number of top results to return. - * @param {string} [namespace="ns1"] - The namespace in the index to search within. - * @param {boolean} [includeValues=true] - Whether to include vector values in the results. - * @returns {Promise} A promise that resolves once the search results are logged. - * @returns {JSON} A data structure giving the top k results. 
+ * @param {Array} embedding - The embedding vector to query with. + * @param {string} [namespace="samplecode"] - The namespace to query. + * @param {number} [topK=5] - The number of top results to return. + * @returns {Promise} A promise that resolves to the query results. */ - async similaritySearch(vector, topK = 3, namespace, includeValues = true) { + async similaritySearch(embedding, namespace = "codebase", topK = 3) { const queryResponse = await this.index.namespace(namespace).query({ - topK, - vector, - includeValues, + vector: embedding, + topK: topK, // Number of top results to return + includeValues: true, + includeMetadata: true // Include metadata in the response }); - console.log(queryResponse); + + console.log(queryResponse.matches); + return queryResponse; } @@ -109,31 +132,16 @@ class PineconeManager { await this.pc.deleteIndex(this.indexName); } - /** + /** * Deletes the vectors in a specified namespace. * * @async * @param {string} [namespace="ns1"] - The namespace in the index to search within. * @returns {Promise} A promise that resolves once all vectors in a namespace are deleted. 
*/ - async deleteVectorsFromNamespace(namespace) { - await this.index.namespace('codebase').deleteAll(); + async deleteVectorsFromNamespace(namespace) { + await this.index.namespace(namespace).deleteAll(); } } -module.exports = PineconeManager; - - - -/* -Example Usage -const pineconeManager = new PineconeManager(process.env.PINECONE_API_KEY, "SampleCode-Upsert"); - -(async () => { - await pineconeManager.initPinecone(); - await pineconeManager.upsertEmbeddings([1.0, 2.0, 3.0]); - await pineconeManager.checkIndex(); - await pineconeManager.similaritySearch([1.0, 1.5]); - await pineconeManager.clearIndex(); -})(); - */ \ No newline at end of file +module.exports = PineconeManager; \ No newline at end of file diff --git a/server/controllers/pineconeQuery.js b/server/controllers/pineconeQuery.js index e92e14e..3db14e4 100644 --- a/server/controllers/pineconeQuery.js +++ b/server/controllers/pineconeQuery.js @@ -1,7 +1,8 @@ const pinecone = require("../config/pineconeConfig/pineconeInit"); -const generateEmbeddings = require("../config/pineconeConfig/embeddingConfig"); +const {generateEmbeddings, processAndUpdateDictionary} = require("../database/embeddingService"); const fs = require("fs"); const path = require("path"); +const readCodeFromFile = require('../database/readCodeFromFile'); const getPineconeResponse = async (req, res) => { const userInput = req.body.prompt; @@ -15,6 +16,8 @@ const getPineconeResponse = async (req, res) => { return res.json({ text: "You haven't uploaded a codebase yet! Please try again." 
}); } + shortPath = "../codebases/"; + if (!userInput) { return res.status(400).json({ error: "Input is required" }); } @@ -22,32 +25,38 @@ const getPineconeResponse = async (req, res) => { if (!process.env.PINECONE_API_KEY) { return res.status(500).json({ error: "API key is missing" }); } - + const input = [userInput]; - const embeddingResponse = await generateEmbeddings(input, "string"); - const embed = embeddingResponse[0].embedding; - + const embed = await generateEmbeddings(input); try { - let files = await pinecone.similaritySearch(embed, 3, "codebase", true); // Using default values - let answer = "The most relevant files to your query were "; + let files = await pinecone.similaritySearch(embed); // Using default values + // console.log(files); + let answer = "The most relevant code chunks to your query are "; + + const filesToSend = []; for (let i = 0; i < files.matches.length; i++) { if (files.matches.length == 0) { answer = "No files relevant to your query could be found."; } else if (files.matches.length == 1) { - answer = `The most relevant file to your query is ${files.matches[i].id.substring(codebasePath.length+1)} with a score of ${files.matches[i].score}.`; + answer = `The most relevant file to your query is the ${files.matches[i].metadata.type} \`\`\`${files.matches[i].id}\`\`\` (from \`\`\`${files.matches[i].metadata.filepath.substring(shortPath.length)}\`\`\`) with a score of ${files.matches[i].score}.`; } else if (i == files.matches.length-1) { - answer = answer.concat(`and ${files.matches[i].id.substring(codebasePath.length+1)} with a score of ${files.matches[i].score}.`); + answer = answer.concat(`and the ${files.matches[i].metadata.type} \`\`\`${files.matches[i].id}\`\`\` (from \`\`\`${files.matches[i].metadata.filepath.substring(shortPath.length)}\`\`\`) with a score of ${files.matches[i].score}.`); } else { - answer = answer.concat(`${files.matches[i].id.substring(codebasePath.length+1)} with a score of ${files.matches[i].score}, `); + 
const openai = require('../config/openaiConfig');

/**
 * Generates an embedding for the given text using OpenAI's API.
 *
 * @param {string|string[]} text - The text (or array of texts) to embed.
 * @returns {Promise<number[]|undefined>} The embedding vector of the first
 *   input, or `undefined` if the request failed (best-effort: the error is
 *   logged and callers are expected to skip the item).
 */
async function generateEmbeddings(text) {
  try {
    const response = await openai.embeddings.create({
      model: 'text-embedding-ada-002',
      input: text,
      encoding_format: 'float',
    });
    return response.data[0].embedding;
  } catch (error) {
    console.error('Error generating embeddings with OpenAI:', error);
    return undefined;
  }
}

/**
 * Attaches an embedding to every function and class entry in the dictionary.
 *
 * Entries whose embedding request fails are left unchanged. Requests run
 * sequentially to stay well within OpenAI rate limits.
 *
 * @param {{functions: Object[], classes: Object[]}} dict - Extracted code blocks.
 * @returns {Promise<Object>} The same dictionary, mutated with `embedding` fields.
 */
async function processAndUpdateDictionary(dict) {
  for (const entry of [...dict.functions, ...dict.classes]) {
    const embedding = await generateEmbeddings(entry.code);
    if (embedding) {
      entry.embedding = embedding;
    }
  }
  return dict;
}

module.exports = { processAndUpdateDictionary, generateEmbeddings };
/**
 * Extracts the function name from a line of JavaScript code.
 *
 * Handles function declarations, arrow functions, and variable declarations
 * for functions by tokenizing the line and taking the second token.
 *
 * @param {string} line - The line of code to analyze.
 * @returns {string|null} - The name of the function if found, otherwise null.
 */
function getFunctionName(line) {
  const tokens = line.split(/\s|\(|\)|{|}|\[|\]|;|,/).filter(Boolean);
  return tokens[1] || null;
}

/**
 * Extracts the class name from a line of JavaScript code.
 *
 * Handles `class Name { ... }` definitions by tokenizing on whitespace and
 * braces and taking the second token.
 *
 * @param {string} line - The line of code to analyze.
 * @returns {string|null} - The name of the class if found, otherwise null.
 */
function getClassName(line) {
  const tokens = line.split(/\s|{|}/).filter(Boolean);
  return tokens[1] || null;
}
/**
 * Collects all lines of code belonging to a function or class definition.
 *
 * Starting at line `i`, keeps accumulating following lines while they are
 * blank or begin with whitespace/closing punctuation (i.e. still inside the
 * definition), and stops at the first line that starts new top-level code.
 *
 * @param {string[]} allLines - The list of lines in the code file.
 * @param {number} i - The index of the starting line for the definition.
 * @returns {string} - The full code block joined with newlines.
 */
function getUntilNoSpace(allLines, i) {
  const collected = [allLines[i]];
  let j = i + 1;
  while (j < allLines.length) {
    const line = allLines[j];
    const stillInsideBlock = line.trim().length === 0 || /[\s){}]/.test(line[0]);
    if (!stillInsideBlock) {
      break;
    }
    collected.push(line);
    j += 1;
  }
  return collected.join('\n');
}

/**
 * Checks whether the given line belongs to a comment or documentation block.
 *
 * @param {string} line - The line of code to check.
 * @param {boolean} inMultiLineComment - True if currently inside a multi-line comment.
 * @param {boolean} inDocBlock - True if currently inside a documentation block.
 * @returns {boolean} - True if the line is part of a comment/doc block.
 */
function isCommentOrDocBlock(line, inMultiLineComment, inDocBlock) {
  // While inside either kind of block comment, only the terminator line is flagged.
  if (inMultiLineComment || inDocBlock) {
    return line.includes('*/');
  }
  const trimmed = line.trim();
  return trimmed.startsWith('//') || trimmed.startsWith('/*') || trimmed.startsWith('/**');
}
/**
 * Extracts functions, classes, and miscellaneous code from a JavaScript file.
 *
 * Reads the file content, identifies functions and classes using basic string
 * matching (see FUNC_PREFIXES / CLASS_PREFIX), and collects their code and
 * names. Comments and documentation blocks are skipped entirely.
 *
 * Fixes over the previous version: lines *inside* a multi-line comment were
 * being classified as code, the closing `*/` line never cleared the
 * in-comment flag, and the separate doc-block flag was dead code (a line
 * starting with `/**` also starts with `/*`). A single block-comment flag
 * now handles both.
 *
 * @param {string} filepath - The path to the JavaScript file.
 * @returns {{functions: Object[], classes: Object[], relativeFilePath: string}}
 *   Lists of functions and classes; each entry has `code`,
 *   `function_name`/`class_name`, and `filepath` keys.
 */
function extractCodeElements(filepath) {
  const fileContent = fs.readFileSync(filepath, 'utf8');
  const allLines = fileContent.split(NEWLINE);
  const functions = [];
  const classes = [];
  const misc = [];

  // Relative path from the current working directory, stored as metadata.
  const relativeFilePath = path.relative(process.cwd(), filepath);

  let inBlockComment = false; // Inside a /* ... */ (or /** ... */) comment.

  for (let i = 0; i < allLines.length; i++) {
    const trimmedLine = allLines[i].trim();

    // --- Comment handling -----------------------------------------------
    if (inBlockComment) {
      if (trimmedLine.includes('*/')) {
        inBlockComment = false; // Terminator found; resume parsing next line.
      }
      continue;
    }
    if (trimmedLine.startsWith('//')) {
      continue; // Single-line comment.
    }
    if (trimmedLine.startsWith('/*')) {
      // Covers both /* and /** openers; stay in comment mode unless the
      // block closes on this same line.
      inBlockComment = !trimmedLine.includes('*/');
      continue;
    }

    // --- Code classification --------------------------------------------
    const isFunction =
      FUNC_PREFIXES.some((prefix) => trimmedLine.startsWith(prefix)) ||
      trimmedLine.includes('=>');
    const isClass = trimmedLine.startsWith(CLASS_PREFIX);

    if (isFunction) {
      const code = getUntilNoSpace(allLines, i);
      const functionName = getFunctionName(trimmedLine);
      if (functionName) {
        functions.push({
          code,
          function_name: functionName,
          filepath: relativeFilePath,
        });
      }
      i += code.split(NEWLINE).length - 1; // Skip lines already consumed.
    } else if (isClass) {
      const code = getUntilNoSpace(allLines, i);
      const className = getClassName(trimmedLine);
      if (className) {
        classes.push({
          code,
          class_name: className,
          filepath: relativeFilePath,
        });
      }
      i += code.split(NEWLINE).length - 1; // Skip lines already consumed.
    } else if (trimmedLine.length > 0) {
      // Anything else that isn't blank is collected as miscellaneous code.
      misc.push({ code: trimmedLine, filepath: relativeFilePath });
    }
  }

  // NOTE: `misc` is collected but (as before) intentionally not returned.
  return {
    functions,
    classes,
    relativeFilePath,
  };
}
pinecone.upsertEmbeddings(embeddedCodeBlocks); + console.log('Embeddings upserted to Pinecone.'); - // Output the tokens - console.log('Extracted tokens:',tokens); - - const embeddingResponse = await generateEmbeddings(tokens, "JSON"); - - await pinecone.upsertEmbeddings(embeddingResponse[0].embedding, "codebase", `${filePath}`); - // Optionally check the index stats // await pinecone.checkIndex();