diff --git a/bun.lockb b/bun.lockb
index 70d7c166..2726d4fe 100755
Binary files a/bun.lockb and b/bun.lockb differ
diff --git a/package.json b/package.json
index b8294252..e0c250b1 100644
--- a/package.json
+++ b/package.json
@@ -41,6 +41,7 @@
     "js-tiktoken": "1.0.15",
     "jsdom": "24.0.0",
     "markdown-it": "14.1.0",
+    "natural": "^8.0.1",
     "openai": "4.56.0",
     "yaml": "^2.6.1"
   },
diff --git a/src/helpers/tf-idf.ts b/src/helpers/tf-idf.ts
new file mode 100644
index 00000000..eb44c771
--- /dev/null
+++ b/src/helpers/tf-idf.ts
@@ -0,0 +1,74 @@
+import natural from "natural";
+import { AllComments } from "../types/content-evaluator-module-type";
+
+/**
+ * Scores how closely two texts match using TF-IDF weighted cosine similarity.
+ */
+export class TfIdf {
+  /**
+   * Lowercases the text, replaces every character that is neither a word
+   * character nor whitespace with a space, and collapses whitespace runs.
+   */
+  private _preprocessText(text: string): string {
+    return text
+      .toLowerCase()
+      .replace(/[^\w\s]/g, " ")
+      .replace(/\s+/g, " ")
+      .trim();
+  }
+
+  /**
+   * Computes the cosine similarity between the TF-IDF vectors of two texts.
+   *
+   * @param text1 - First text to compare.
+   * @param text2 - Second text to compare.
+   * @returns The cosine similarity, or 0 when either text has no weighted terms.
+   */
+  public calculateSimilarity(text1: string, text2: string): number {
+    // A fresh two-document corpus per call keeps scores independent of earlier calls.
+    const corpus = new natural.TfIdf();
+    corpus.addDocument(this._preprocessText(text1));
+    corpus.addDocument(this._preprocessText(text2));
+
+    // Index the weights by term so each lookup is O(1) instead of a linear scan.
+    const weights1 = new Map(corpus.listTerms(0).map((v) => [v.term, v.tfidf] as const));
+    const weights2 = new Map(corpus.listTerms(1).map((v) => [v.term, v.tfidf] as const));
+
+    let dotProduct = 0;
+    let squaredNorm1 = 0;
+    let squaredNorm2 = 0;
+
+    // Walk the union of terms; a term missing from one text weighs 0 there.
+    for (const term of new Set([...weights1.keys(), ...weights2.keys()])) {
+      const weight1 = weights1.get(term) ?? 0;
+      const weight2 = weights2.get(term) ?? 0;
+      dotProduct += weight1 * weight2;
+      squaredNorm1 += weight1 * weight1;
+      squaredNorm2 += weight2 * weight2;
+    }
+
+    // A zero-length vector has no direction, so report no similarity.
+    if (squaredNorm1 === 0 || squaredNorm2 === 0) return 0;
+
+    return dotProduct / (Math.sqrt(squaredNorm1) * Math.sqrt(squaredNorm2));
+  }
+
+  /**
+   * Ranks comments by their similarity to the specification, most similar first.
+   *
+   * @param specification - Text every comment is compared against.
+   * @param comments - Candidate comments; the input array is not modified.
+   * @param limit - Maximum number of entries to return; negative values yield none.
+   * @returns Up to `limit` comments paired with their similarity score.
+   */
+  getTopComments(
+    specification: string,
+    comments: AllComments,
+    limit = 10
+  ): { similarity: number; comment: AllComments[number] }[] {
+    return comments
+      .map((comment) => ({ similarity: this.calculateSimilarity(specification, comment.comment), comment }))
+      .sort((a, b) => b.similarity - a.similarity)
+      .slice(0, Math.max(0, limit));
+  }
+}
diff --git a/src/parser/content-evaluator-module.ts b/src/parser/content-evaluator-module.ts
index 70393aa5..147f27a0 100644
--- a/src/parser/content-evaluator-module.ts
+++ b/src/parser/content-evaluator-module.ts
@@ -15,6 +15,9 @@ import {
 import { BaseModule } from "../types/module";
 import { ContextPlugin } from "../types/plugin-input";
 import { GithubCommentScore, Result } from "../types/results";
+import { TfIdf } from "../helpers/tf-idf";
+
+const TOKEN_MODEL_LIMIT = 124000;
 
 /**
  * Evaluates and rates comments.
@@ -61,7 +64,7 @@ export class ContentEvaluatorModule extends BaseModule {
     const allCommentsUnClean = data.allComments || [];
     const allComments: { id: number; comment: string; author: string }[] = [];
     for (const commentObj of allCommentsUnClean) {
-      if (commentObj.user) {
+      if (commentObj.user && commentObj.user.type !== "Bot") {
         allComments.push({ id: commentObj.id, comment: commentObj.body ?? "", author: commentObj.user.login });
       }
     }
@@ -178,7 +181,16 @@ export class ContentEvaluatorModule extends BaseModule {
       const dummyResponse = JSON.stringify(this._generateDummyResponse(comments), null, 2);
       const maxTokens = this._calculateMaxTokens(dummyResponse);
 
-      const promptForComments = this._generatePromptForComments(specification, comments, allComments);
+      let promptForComments = this._generatePromptForComments(specification, comments, allComments);
+      if (this._calculateMaxTokens(promptForComments, Infinity) > TOKEN_MODEL_LIMIT) {
+        const tfidf = new TfIdf();
+        const mostImportantComments = tfidf.getTopComments(specification, allComments);
+        promptForComments = this._generatePromptForComments(
+          specification,
+          comments,
+          mostImportantComments.map((o) => o.comment)
+        );
+      }
       commentRelevances = await this._submitPrompt(promptForComments, maxTokens);
     }
 
@@ -186,7 +198,17 @@ export class ContentEvaluatorModule extends BaseModule {
       const dummyResponse = JSON.stringify(this._generateDummyResponse(prComments), null, 2);
       const maxTokens = this._calculateMaxTokens(dummyResponse);
 
-      const promptForPrComments = this._generatePromptForPrComments(specification, prComments);
+      let promptForPrComments = this._generatePromptForPrComments(specification, prComments);
+      if (this._calculateMaxTokens(promptForPrComments, Infinity) > TOKEN_MODEL_LIMIT) {
+        // NOTE(review): this fallback prompts with `comments`, not `prComments` — confirm this is intended.
+        const tfidf = new TfIdf();
+        const mostImportantComments = tfidf.getTopComments(specification, allComments);
+        promptForPrComments = this._generatePromptForComments(
+          specification,
+          comments,
+          mostImportantComments.map((o) => o.comment)
+        );
+      }
       prCommentRelevances = await this._submitPrompt(promptForPrComments, maxTokens);
     }
 