Skip to content

Commit

Permalink
fix: the cosine similarity is evaluated for top comments and bot comm…
Browse files Browse the repository at this point in the history
…ents are ignored
  • Loading branch information
gentlementlegen committed Dec 26, 2024
1 parent 3f2fea1 commit c3d091b
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 3 deletions.
Binary file modified bun.lockb
Binary file not shown.
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
"js-tiktoken": "1.0.15",
"jsdom": "24.0.0",
"markdown-it": "14.1.0",
"natural": "^8.0.1",
"openai": "4.56.0",
"yaml": "^2.6.1"
},
Expand Down
59 changes: 59 additions & 0 deletions src/helpers/tf-idf.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import natural from "natural";
import { AllComments } from "../types/content-evaluator-module-type";

export class TfIdf {
private _tfidf: natural.TfIdf;

constructor() {
this._tfidf = new natural.TfIdf();
}

private _preprocessText(text: string): string {
return text
.toLowerCase()
.replace(/[^\w\s]/g, " ")
.replace(/\s+/g, " ")
.trim();
}

public calculateSimilarity(text1: string, text2: string): number {
this._tfidf = new natural.TfIdf();
const processed1 = this._preprocessText(text1);
const processed2 = this._preprocessText(text2);

this._tfidf.addDocument(processed1);
this._tfidf.addDocument(processed2);

const vector1 = this._tfidf.listTerms(0);
const vector2 = this._tfidf.listTerms(1);

const terms = new Set([...vector1.map((v) => v.term), ...vector2.map((v) => v.term)]);

const v1: number[] = [];
const v2: number[] = [];

terms.forEach((term) => {
const term1 = vector1.find((v) => v.term === term);
const term2 = vector2.find((v) => v.term === term);
v1.push(term1 ? term1.tfidf : 0);
v2.push(term2 ? term2.tfidf : 0);
});

const dotProduct = v1.reduce((sum, val, i) => sum + val * v2[i], 0);
const magnitude1 = Math.sqrt(v1.reduce((sum, val) => sum + val * val, 0));
const magnitude2 = Math.sqrt(v2.reduce((sum, val) => sum + val * val, 0));

if (magnitude1 === 0 || magnitude2 === 0) return 0;

return dotProduct / (magnitude1 * magnitude2);
}

getTopComments(specification: string, comments: AllComments, limit = 10) {
return comments
.map((comment) => {
return { similarity: this.calculateSimilarity(specification, comment.comment), comment };
})
.sort((a, b) => b.similarity - a.similarity)
.slice(0, limit);
}
}
27 changes: 24 additions & 3 deletions src/parser/content-evaluator-module.ts
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ import {
import { BaseModule } from "../types/module";
import { ContextPlugin } from "../types/plugin-input";
import { GithubCommentScore, Result } from "../types/results";
import { TfIdf } from "../helpers/tf-idf";

const TOKEN_MODEL_LIMIT = 124000;

/**
* Evaluates and rates comments.
Expand Down Expand Up @@ -61,7 +64,7 @@ export class ContentEvaluatorModule extends BaseModule {
const allCommentsUnClean = data.allComments || [];
const allComments: { id: number; comment: string; author: string }[] = [];
for (const commentObj of allCommentsUnClean) {
if (commentObj.user) {
if (commentObj.user && commentObj.user.type !== "Bot") {
allComments.push({ id: commentObj.id, comment: commentObj.body ?? "", author: commentObj.user.login });
}
}
Expand Down Expand Up @@ -178,15 +181,33 @@ export class ContentEvaluatorModule extends BaseModule {
const dummyResponse = JSON.stringify(this._generateDummyResponse(comments), null, 2);
const maxTokens = this._calculateMaxTokens(dummyResponse);

const promptForComments = this._generatePromptForComments(specification, comments, allComments);
let promptForComments = this._generatePromptForComments(specification, comments, allComments);
if (this._calculateMaxTokens(promptForComments, Infinity) > TOKEN_MODEL_LIMIT) {
const tfidf = new TfIdf();
const mostImportantComments = tfidf.getTopComments(specification, allComments);
promptForComments = this._generatePromptForComments(
specification,
comments,
mostImportantComments.map((o) => o.comment)
);
}
commentRelevances = await this._submitPrompt(promptForComments, maxTokens);
}

if (prComments.length) {
const dummyResponse = JSON.stringify(this._generateDummyResponse(prComments), null, 2);
const maxTokens = this._calculateMaxTokens(dummyResponse);

const promptForPrComments = this._generatePromptForPrComments(specification, prComments);
let promptForPrComments = this._generatePromptForPrComments(specification, prComments);
if (this._calculateMaxTokens(promptForPrComments, Infinity) > TOKEN_MODEL_LIMIT) {
const tfidf = new TfIdf();
const mostImportantComments = tfidf.getTopComments(specification, allComments);
promptForPrComments = this._generatePromptForComments(
specification,
comments,
mostImportantComments.map((o) => o.comment)
);
}
prCommentRelevances = await this._submitPrompt(promptForPrComments, maxTokens);
}

Expand Down

0 comments on commit c3d091b

Please sign in to comment.