From 732b01015b668ea41eea5ddfbe9cb107f663b669 Mon Sep 17 00:00:00 2001 From: Hritwik Tripathi <52003051+3scava1i3r@users.noreply.github.com> Date: Fri, 27 Dec 2024 11:54:19 +0000 Subject: [PATCH 1/5] feat: relevance for img and a tags --- .env.example | 5 +- src/parser/data-purge-module.ts | 156 +++++++++++++++++++++++++++++++- 2 files changed, 159 insertions(+), 2 deletions(-) diff --git a/.env.example b/.env.example index 2611122d..63dffa92 100644 --- a/.env.example +++ b/.env.example @@ -35,4 +35,7 @@ PERMIT_ERC20_TOKENS_NO_FEE_WHITELIST="" KERNEL_PUBLIC_KEY="" # Logger level, default is INFO -LOG_LEVEL="" \ No newline at end of file +LOG_LEVEL="" + +# Huggingface API key +HUGGINGFACE_API_KEY="" \ No newline at end of file diff --git a/src/parser/data-purge-module.ts b/src/parser/data-purge-module.ts index 09e1c0ac..ddfaa968 100644 --- a/src/parser/data-purge-module.ts +++ b/src/parser/data-purge-module.ts @@ -44,6 +44,158 @@ export class DataPurgeModule extends BaseModule { return false; } + + async _generateImageDescription(imageUrl: string): Promise { + try { + // Fetch image data from URL + const imageResponse = await fetch(imageUrl); + const imageData = await imageResponse.arrayBuffer(); + + // Send to HuggingFace API + const response = await fetch( + "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large", + { + headers: { + Authorization: `Bearer ${this.context.env.HUGGINGFACE_API_KEY}`, + "Content-Type": "application/json", + }, + method: "POST", + body: Buffer.from(imageData) + } + ); + + const result = await response.json(); + return result[0]?.generated_text || null; + } catch (error) { + this.context.logger.error(`Failed to generate image description: ${error}`); + return null; + } + } + + async _generateChatResponse(userMessage: string): Promise { + try { + // Define the Hugging Face API endpoint + const url = + "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3/v1/chat/completions"; + + // Construct the payload + const payload = { + model: "mistralai/Mistral-7B-Instruct-v0.3", + messages: [ + { + role: "user", + content: userMessage, + }, + ], + max_tokens: 500, + stream: false, + }; + + // Send request to Hugging Face API + const response = await fetch(url, { + headers: { + Authorization: `Bearer ${this.context.env.HUGGINGFACE_API_KEY}`, + "Content-Type": "application/json", + }, + method: "POST", + body: JSON.stringify(payload), + }); + + // Parse the response + const result = await response.json(); + return result.choices?.[0]?.message?.content || null; + } catch (error) { + this.context.logger.error(`Failed to generate chat response: ${error}`); + return null; + } + } + + + + async _generateLinkDescription(linkUrl: string): Promise { + try { + // Fetch the content of the link + const linkResponse = await fetch(linkUrl); + const contentType = linkResponse.headers.get('content-type'); + + // Only process text/html or text/plain content + if (!contentType || (!contentType.includes('text/html') && !contentType.includes('text/plain'))) { + this.context.logger.info(`Skipping non-HTML content: ${contentType}, ${linkUrl}`); + return null; + } + + const linkData = await linkResponse.text(); + const cleanText = linkData + .replace(/)<[^<]*)*<\/script>/gi, '') // Remove scripts + .replace(/)<[^<]*)*<\/style>/gi, '') // Remove styles + .replace(/<[^>]+>/g, ' ') // Remove HTML tags + .replace(/\s+/g, ' ') // Normalize whitespace + .replace(/{\s*"props".*$/s, '') // Remove JSON data + .trim(); + + + const generatedTextDescription = await this._generateChatResponse('Summarize the following webpage code into a concise and easy-to-understand text explanation of one paragraph with no bullet points. Focus on describing the purpose, structure, and functionality of the code, including key elements such as layout, styles, scripts, and any interactive features. Avoid technical jargon unless necessary'+cleanText); + + return generatedTextDescription; + } catch (error) { + this.context.logger.error(`Failed to generate link description: ${error}`); + return null; + } + } + + private async _processCommentBody(commentBody: string): Promise { + + // Extract image URL from Markdown or HTML image tags + const imageMatch = commentBody.match(/!\[.*?\]\((.*?)\)/) || commentBody.match(/src="([^"]*)"/); + const imageUrl = imageMatch ? imageMatch[1] : null; + + if (imageUrl) { + const description = await this._generateImageDescription(imageUrl); + if (description) { + this.context.logger.info(`Generated description: ${description}`); + + // Update the commentBody by replacing alt with description + const updatedContent = commentBody + // Replace Markdown-style images with HTML tags and set description attribute + .replace(/!\[(.*?)\]\((.*?)\)/g, `${description}`) + // Replace the alt attribute with the description variable's value + .replace(/alt="[^"]*"/, `alt="${description}"`); + + return updatedContent; + } + } + + return commentBody; + } + + private async _processCommentBodyLink(commentBody: string): Promise { + + const linkRegex = /\[([^\]]+)\]\(([^)]+)\)|]+href="([^"]+)"[^>]*>|https?:\/\/[^\s<)]+/g; + const links = [...commentBody.matchAll(linkRegex)] + .map(match => match[2] || match[3] || match[0]) + .map(url => url.replace(/[?"]/g, '')); // Clean up URLs by removing ? and " characters + + let updatedContent = commentBody; + + + for (const link of links) { + const description = await this._generateLinkDescription(link); + if (description) { + const linkResponse = await fetch(link); + const contentType = linkResponse.headers.get('content-type'); + + if (contentType && (contentType.includes('text/html') || contentType.includes('text/plain'))) { + updatedContent = commentBody.replace( + new RegExp(link, 'g'), + `${link}` + ); + } + } + } + return updatedContent; + } + + async transform(data: Readonly, result: Result) { this._assignmentPeriods = await getAssignmentPeriods( this.context.octokit, @@ -54,7 +206,9 @@ export class DataPurgeModule extends BaseModule { continue; } if (comment.body && comment.user?.login && result[comment.user.login]) { - const newContent = comment.body + const processedCommentBody = await this._processCommentBody(comment.body); + const processedCommentBodyLink = await this._processCommentBodyLink(processedCommentBody); + const newContent = processedCommentBodyLink // Remove quoted text .replace(/^>.*$/gm, "") // Remove commands such as /start From cc37ede442d0211b263745c8604291efbc1c9f10 Mon Sep 17 00:00:00 2001 From: Hritwik Tripathi <52003051+3scava1i3r@users.noreply.github.com> Date: Fri, 27 Dec 2024 12:05:32 +0000 Subject: [PATCH 2/5] fix:eslint on data-purge --- src/parser/data-purge-module.ts | 124 +++++++++++++++----------------- 1 file changed, 59 insertions(+), 65 deletions(-) diff --git a/src/parser/data-purge-module.ts b/src/parser/data-purge-module.ts index ddfaa968..e92abb5a 100644 --- a/src/parser/data-purge-module.ts +++ b/src/parser/data-purge-module.ts @@ -44,13 +44,12 @@ export class DataPurgeModule extends BaseModule { return false; } - async _generateImageDescription(imageUrl: string): Promise { try { // Fetch image data from URL const imageResponse = await fetch(imageUrl); const imageData = await imageResponse.arrayBuffer(); - + // Send to HuggingFace API const response = await fetch( "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large", @@ -60,23 +59,22 @@ export class DataPurgeModule extends BaseModule { "Content-Type": "application/json", }, method: "POST", - body: Buffer.from(imageData) + body: Buffer.from(imageData), } ); - + const result = await response.json(); return result[0]?.generated_text || null; } catch (error) { this.context.logger.error(`Failed to generate image description: ${error}`); return null; } - } + } async _generateChatResponse(userMessage: string): Promise { try { // Define the Hugging Face API endpoint - const url = - "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3/v1/chat/completions"; + const url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3/v1/chat/completions"; // Construct the payload const payload = { @@ -110,92 +108,88 @@ export class DataPurgeModule extends BaseModule { } } + async _generateLinkDescription(linkUrl: string): Promise { + try { + // Fetch the content of the link + const linkResponse = await fetch(linkUrl); + const contentType = linkResponse.headers.get("content-type"); - - async _generateLinkDescription(linkUrl: string): Promise { - try { - // Fetch the content of the link - const linkResponse = await fetch(linkUrl); - const contentType = linkResponse.headers.get('content-type'); - - // Only process text/html or text/plain content - if (!contentType || (!contentType.includes('text/html') && !contentType.includes('text/plain'))) { - this.context.logger.info(`Skipping non-HTML content: ${contentType}, ${linkUrl}`); - return null; - } - - const linkData = await linkResponse.text(); - const cleanText = linkData - .replace(/)<[^<]*)*<\/script>/gi, '') // Remove scripts - .replace(/)<[^<]*)*<\/style>/gi, '') // Remove styles - .replace(/<[^>]+>/g, ' ') // Remove HTML tags - .replace(/\s+/g, ' ') // Normalize whitespace - .replace(/{\s*"props".*$/s, '') // Remove JSON data - .trim(); - - - const generatedTextDescription = await this._generateChatResponse('Summarize the following webpage code into a concise and easy-to-understand text explanation of one paragraph with no bullet points. Focus on describing the purpose, structure, and functionality of the code, including key elements such as layout, styles, scripts, and any interactive features. Avoid technical jargon unless necessary'+cleanText); - - return generatedTextDescription; - } catch (error) { - this.context.logger.error(`Failed to generate link description: ${error}`); + // Only process text/html or text/plain content + if (!contentType || (!contentType.includes("text/html") && !contentType.includes("text/plain"))) { + this.context.logger.info(`Skipping non-HTML content: ${contentType}, ${linkUrl}`); return null; } - } - private async _processCommentBody(commentBody: string): Promise { + const linkData = await linkResponse.text(); + const cleanText = linkData + .replace(/)<[^<]*)*<\/script>/gi, "") // Remove scripts + .replace(/)<[^<]*)*<\/style>/gi, "") // Remove styles + .replace(/<[^>]+>/g, " ") // Remove HTML tags + .replace(/\s+/g, " ") // Normalize whitespace + .replace(/{\s*"props".*$/s, "") // Remove JSON data + .trim(); + + const generatedTextDescription = await this._generateChatResponse( + "Summarize the following webpage code into a concise and easy-to-understand text explanation of one paragraph with no bullet points. Focus on describing the purpose, structure, and functionality of the code, including key elements such as layout, styles, scripts, and any interactive features. Avoid technical jargon unless necessary" + + cleanText + ); - // Extract image URL from Markdown or HTML image tags - const imageMatch = commentBody.match(/!\[.*?\]\((.*?)\)/) || commentBody.match(/src="([^"]*)"/); - const imageUrl = imageMatch ? imageMatch[1] : null; + return generatedTextDescription; + } catch (error) { + this.context.logger.error(`Failed to generate link description: ${error}`); + return null; + } + } - if (imageUrl) { - const description = await this._generateImageDescription(imageUrl); - if (description) { - this.context.logger.info(`Generated description: ${description}`); + private async _processCommentBody(commentBody: string): Promise { + // Extract image URL from Markdown or HTML image tags + const imageMatch = commentBody.match(/!\[.*?\]\((.*?)\)/) || commentBody.match(/src="([^"]*)"/); + const imageUrl = imageMatch ? imageMatch[1] : null; - // Update the commentBody by replacing alt with description - const updatedContent = commentBody - // Replace Markdown-style images with HTML tags and set description attribute - .replace(/!\[(.*?)\]\((.*?)\)/g, `${description}`) - // Replace the alt attribute with the description variable's value - .replace(/alt="[^"]*"/, `alt="${description}"`); + if (imageUrl) { + const description = await this._generateImageDescription(imageUrl); + if (description) { + this.context.logger.info(`Generated description: ${description}`); - return updatedContent; - } + // Update the commentBody by replacing alt with description + const updatedContent = commentBody + // Replace Markdown-style images with HTML tags and set description attribute + .replace(/!\[(.*?)\]\((.*?)\)/g, `${description}`) + // Replace the alt attribute with the description variable's value + .replace(/alt="[^"]*"/, `alt="${description}"`); + + return updatedContent; } + } - return commentBody; + return commentBody; } private async _processCommentBodyLink(commentBody: string): Promise { - - const linkRegex = /\[([^\]]+)\]\(([^)]+)\)|]+href="([^"]+)"[^>]*>|https?:\/\/[^\s<)]+/g; + const linkRegex = /\[([^\]]+)\]\(([^)]+)\)|]*href="([^"]+)"|https?:\/\/\S+/g; const links = [...commentBody.matchAll(linkRegex)] - .map(match => match[2] || match[3] || match[0]) - .map(url => url.replace(/[?"]/g, '')); // Clean up URLs by removing ? and " characters - - let updatedContent = commentBody; + .map((match) => match[2] || match[3] || match[0]) + .map((url) => url.replace(/[?"]/g, "")); // Clean up URLs by removing ? and " characters + let updatedContent = commentBody; for (const link of links) { const description = await this._generateLinkDescription(link); if (description) { const linkResponse = await fetch(link); - const contentType = linkResponse.headers.get('content-type'); - - if (contentType && (contentType.includes('text/html') || contentType.includes('text/plain'))) { + const contentType = linkResponse.headers.get("content-type"); + + if (contentType && (contentType.includes("text/html") || contentType.includes("text/plain"))) { updatedContent = commentBody.replace( - new RegExp(link, 'g'), + new RegExp(link, "g"), `${link}` ); } - } + } } return updatedContent; } - async transform(data: Readonly, result: Result) { this._assignmentPeriods = await getAssignmentPeriods( this.context.octokit, From 9cb2f4f6412e53f8d257abe9cdf113df8d1750cc Mon Sep 17 00:00:00 2001 From: Hritwik Tripathi <52003051+3scava1i3r@users.noreply.github.com> Date: Fri, 27 Dec 2024 12:11:13 +0000 Subject: [PATCH 3/5] fix:cspell --- .cspell.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.cspell.json b/.cspell.json index e1d8982f..67d72553 100644 --- a/.cspell.json +++ b/.cspell.json @@ -31,7 +31,9 @@ "Rpcs", "sonarjs", "pico", - "timespan" + "timespan", + "HUGGINGFACE", + "mistralai" ], "dictionaries": ["typescript", "node", "software-terms"], "import": [ From 59a413908d93f960ea54a9f2800b39b27b16b0af Mon Sep 17 00:00:00 2001 From: Hritwik Tripathi <52003051+3scava1i3r@users.noreply.github.com> Date: Fri, 27 Dec 2024 12:15:25 +0000 Subject: [PATCH 4/5] fix: env type addition --- src/types/env-type.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/src/types/env-type.ts b/src/types/env-type.ts index 8dc31683..6f202327 100644 --- a/src/types/env-type.ts +++ b/src/types/env-type.ts @@ -14,6 +14,7 @@ const envConfigSchema = Type.Object({ PERMIT_ERC20_TOKENS_NO_FEE_WHITELIST: Type.String(), KERNEL_PUBLIC_KEY: Type.Optional(Type.String()), LOG_LEVEL: Type.Enum(LOG_LEVEL, { default: LOG_LEVEL.INFO }), + HUGGINGFACE_API_KEY: Type.String(), }); export type EnvConfig = Static; From 8ee06995c9336eaca89a9919685d358e1602a0bf Mon Sep 17 00:00:00 2001 From: Hritwik Tripathi <52003051+3scava1i3r@users.noreply.github.com> Date: Wed, 8 Jan 2025 13:21:14 +0000 Subject: [PATCH 5/5] fix: changed logic from huggingface to openrouter --- .cspell.json | 1 - .env.example | 3 - src/configuration/data-purge-config.ts | 25 ++++++ src/parser/data-purge-module.ts | 102 ++++++++++++------------- src/types/env-type.ts | 1 - 5 files changed, 73 insertions(+), 59 deletions(-) diff --git a/.cspell.json b/.cspell.json index 67d72553..4eab5c11 100644 --- a/.cspell.json +++ b/.cspell.json @@ -32,7 +32,6 @@ "sonarjs", "pico", "timespan", - "HUGGINGFACE", "mistralai" ], "dictionaries": ["typescript", "node", "software-terms"], diff --git a/.env.example b/.env.example index 63dffa92..bcd1a94e 100644 --- a/.env.example +++ b/.env.example @@ -36,6 +36,3 @@ KERNEL_PUBLIC_KEY="" # Logger level, default is INFO LOG_LEVEL="" - -# Huggingface API key -HUGGINGFACE_API_KEY="" \ No newline at end of file diff --git a/src/configuration/data-purge-config.ts b/src/configuration/data-purge-config.ts index f5331453..02b29d86 100644 --- a/src/configuration/data-purge-config.ts +++ b/src/configuration/data-purge-config.ts @@ -1,5 +1,29 @@ import { Type, Static } from "@sinclair/typebox"; +const openAiType = Type.Object( + { + /** + * AI model to use for comment evaluation. + */ + model: Type.String({ + default: "gpt-4o-2024-08-06", + description: "OpenAI model, e.g. gpt-4o", + examples: ["gpt-4o"], + }), + /** + * Specific endpoint to send the comments to. + */ + endpoint: Type.String({ + default: "https://api.openai.com/v1", + pattern: /^(https?:\/\/[^\s$.?#].\S*)$/i.source, + description: "OpenAI endpoint for requests", + examples: ["https://api.openai.com/v1"], + }), + }, + { default: {} } +); + + export const dataPurgeConfigurationType = Type.Object({ skipCommentsWhileAssigned: Type.Union([Type.Literal("all"), Type.Literal("exact"), Type.Literal("none")], { default: "all", @@ -10,6 +34,7 @@ export const dataPurgeConfigurationType = Type.Object({ "- 'none': Includes all comments, regardless of assignment status or timing.", examples: ["all", "exact", "none"], }), + openAi: openAiType, }); export type DataPurgeConfiguration = Static; diff --git a/src/parser/data-purge-module.ts b/src/parser/data-purge-module.ts index e92abb5a..1e200648 100644 --- a/src/parser/data-purge-module.ts +++ b/src/parser/data-purge-module.ts @@ -5,6 +5,7 @@ import { IssueActivity } from "../issue-activity"; import { parseGitHubUrl } from "../start"; import { BaseModule } from "../types/module"; import { Result } from "../types/results"; +import OpenAI from 'openai'; /** * Removes the data in the comments that we do not want to be processed. @@ -13,6 +14,11 @@ export class DataPurgeModule extends BaseModule { readonly _configuration: DataPurgeConfiguration | null = this.context.config.incentives.dataPurge; _assignmentPeriods: UserAssignments = {}; + readonly _openAi = new OpenAI({ + apiKey: this.context.env.OPENAI_API_KEY, + ...(this._configuration?.openAi.endpoint && { baseURL: this._configuration.openAi.endpoint }), + }); + get enabled(): boolean { if (!this._configuration) { this.context.logger.error("Invalid / missing configuration detected for DataPurgeModule, disabling."); @@ -44,27 +50,28 @@ export class DataPurgeModule extends BaseModule { return false; } + + async _generateImageDescription(imageUrl: string): Promise { try { - // Fetch image data from URL const imageResponse = await fetch(imageUrl); const imageData = await imageResponse.arrayBuffer(); - - // Send to HuggingFace API - const response = await fetch( - "https://api-inference.huggingface.co/models/Salesforce/blip-image-captioning-large", - { - headers: { - Authorization: `Bearer ${this.context.env.HUGGINGFACE_API_KEY}`, - "Content-Type": "application/json", - }, - method: "POST", - body: Buffer.from(imageData), - } - ); - - const result = await response.json(); - return result[0]?.generated_text || null; + const base64Image = Buffer.from(imageData).toString('base64'); + const response = await this._openAi.chat.completions.create({ + model: "chatgpt-4o-latest", + messages: [ + { + role: "user", + content: [ + { type: "text", text: "Describe this image concisely in one paragraph." }, + { type: "image_url", image_url: { url: `data:image/jpeg;base64,${base64Image}` } } + ] + } + ], + max_tokens: 300 + }); + + return response.choices[0]?.message?.content || null; } catch (error) { this.context.logger.error(`Failed to generate image description: ${error}`); return null; @@ -73,35 +80,18 @@ export class DataPurgeModule extends BaseModule { async _generateChatResponse(userMessage: string): Promise { try { - // Define the Hugging Face API endpoint - const url = "https://api-inference.huggingface.co/models/mistralai/Mistral-7B-Instruct-v0.3/v1/chat/completions"; - - // Construct the payload - const payload = { - model: "mistralai/Mistral-7B-Instruct-v0.3", + const response = await this._openAi.chat.completions.create({ + model: "gpt-4o-2024-08-06", messages: [ { role: "user", - content: userMessage, - }, + content: userMessage + } ], - max_tokens: 500, - stream: false, - }; - - // Send request to Hugging Face API - const response = await fetch(url, { - headers: { - Authorization: `Bearer ${this.context.env.HUGGINGFACE_API_KEY}`, - "Content-Type": "application/json", - }, - method: "POST", - body: JSON.stringify(payload), + max_tokens: 500 }); - // Parse the response - const result = await response.json(); - return result.choices?.[0]?.message?.content || null; + return response.choices[0]?.message?.content || null; } catch (error) { this.context.logger.error(`Failed to generate chat response: ${error}`); return null; @@ -110,31 +100,35 @@ export class DataPurgeModule extends BaseModule { async _generateLinkDescription(linkUrl: string): Promise { try { - // Fetch the content of the link const linkResponse = await fetch(linkUrl); - const contentType = linkResponse.headers.get("content-type"); + const contentType = linkResponse.headers.get('content-type'); - // Only process text/html or text/plain content - if (!contentType || (!contentType.includes("text/html") && !contentType.includes("text/plain"))) { + if (!contentType || (!contentType.includes('text/html') && !contentType.includes('text/plain'))) { this.context.logger.info(`Skipping non-HTML content: ${contentType}, ${linkUrl}`); return null; } const linkData = await linkResponse.text(); const cleanText = linkData - .replace(/)<[^<]*)*<\/script>/gi, "") // Remove scripts - .replace(/)<[^<]*)*<\/style>/gi, "") // Remove styles - .replace(/<[^>]+>/g, " ") // Remove HTML tags - .replace(/\s+/g, " ") // Normalize whitespace - .replace(/{\s*"props".*$/s, "") // Remove JSON data + .replace(/)<[^<]*)*<\/script>/gi, '') + .replace(/)<[^<]*)*<\/style>/gi, '') + .replace(/<[^>]+>/g, ' ') + .replace(/\s+/g, ' ') + .replace(/{\s*"props".*$/s, '') .trim(); - const generatedTextDescription = await this._generateChatResponse( - "Summarize the following webpage code into a concise and easy-to-understand text explanation of one paragraph with no bullet points. Focus on describing the purpose, structure, and functionality of the code, including key elements such as layout, styles, scripts, and any interactive features. Avoid technical jargon unless necessary" + - cleanText - ); + const response = await this._openAi.chat.completions.create({ + model: "gpt-4o-2024-08-06", + messages: [ + { + role: "user", + content: `Summarize the following webpage code into a concise and easy-to-understand text explanation of one paragraph with no bullet points. Focus on describing the purpose, structure, and functionality of the code, including key elements such as layout, styles, scripts, and any interactive features. Avoid technical jargon unless necessary: ${cleanText}` + } + ], + max_tokens: 500 + }); - return generatedTextDescription; + return response.choices[0]?.message?.content || null; } catch (error) { this.context.logger.error(`Failed to generate link description: ${error}`); return null; diff --git a/src/types/env-type.ts b/src/types/env-type.ts index 6f202327..8dc31683 100644 --- a/src/types/env-type.ts +++ b/src/types/env-type.ts @@ -14,7 +14,6 @@ const envConfigSchema = Type.Object({ PERMIT_ERC20_TOKENS_NO_FEE_WHITELIST: Type.String(), KERNEL_PUBLIC_KEY: Type.Optional(Type.String()), LOG_LEVEL: Type.Enum(LOG_LEVEL, { default: LOG_LEVEL.INFO }), - HUGGINGFACE_API_KEY: Type.String(), }); export type EnvConfig = Static;