From 13424e42b1e159fb4a92cfbe361e4ca71079e754 Mon Sep 17 00:00:00 2001 From: Shivaditya Shivganesh Date: Sat, 4 Jan 2025 11:34:09 -0500 Subject: [PATCH] fix: timestamp based issue scraping --- functions/issue-scraper.ts | 131 +++++++++++++++---------------------- 1 file changed, 54 insertions(+), 77 deletions(-) diff --git a/functions/issue-scraper.ts b/functions/issue-scraper.ts index b10e7af..0904f32 100644 --- a/functions/issue-scraper.ts +++ b/functions/issue-scraper.ts @@ -105,13 +105,7 @@ export async function onRequest(ctx: Context): Promise { try { const supabase = new SupabaseClient(env.SUPABASE_URL, env.SUPABASE_KEY); - const response = await issueScraper( - githubUserName, - supabase, - env.VOYAGEAI_API_KEY, - result.authToken, - timestamp - ); + const response = await issueScraper(githubUserName, supabase, env.VOYAGEAI_API_KEY, result.authToken, timestamp); return new Response(response, { headers: corsHeaders, status: 200, @@ -192,19 +186,13 @@ const SEARCH_ISSUES_QUERY = ` } `; -async function fetchUserIssuesBatch( - octokit: InstanceType, - username: string, - lastScraped?: number -): Promise { +async function fetchUserIssuesBatch(octokit: InstanceType, username: string, lastScraped?: number): Promise { const allIssues: IssueNode[] = []; let hasNextPage = true; let cursor: string | null = null; // Construct the query with the lastScraped timestamp - const searchText = `assignee:${username} is:issue is:closed reason:completed ${ - lastScraped ? `closed:>${new Date(lastScraped).toISOString()}` : "" - }`; + const searchText = `assignee:${username} is:issue is:closed reason:completed ${lastScraped ? `closed:>${new Date(lastScraped).toISOString()}` : ""}`; while (hasNextPage) { const variables: { searchText: string; after?: string } = { searchText }; @@ -212,10 +200,7 @@ async function fetchUserIssuesBatch( variables.after = cursor; } - const response: GraphQlSearchResponse = await octokit.graphql( - SEARCH_ISSUES_QUERY, - variables - ); + const response: GraphQlSearchResponse = await octokit.graphql(SEARCH_ISSUES_QUERY, variables); allIssues.push(...response.search.nodes); @@ -227,65 +212,59 @@ async function fetchUserIssuesBatch( } async function batchEmbeddings(voyageClient: VoyageAIClient, texts: string[]): Promise<(number[] | undefined)[]> { - try { - const embeddingResponse = await voyageClient.embed({ - input: texts, - model: "voyage-large-2-instruct", - inputType: "document", - }); - return embeddingResponse.data?.map((item) => item.embedding) || []; - } catch (error) { - console.error("Error batching embeddings:", error); - throw error; - } + try { + const embeddingResponse = await voyageClient.embed({ + input: texts, + model: "voyage-large-2-instruct", + inputType: "document", + }); + return embeddingResponse.data?.map((item) => item.embedding) || []; + } catch (error) { + console.error("Error batching embeddings:", error); + throw error; } - - async function batchUpsertIssues( - supabase: SupabaseClient, - issues: Array<{ - id: string; - markdown: string; - plaintext: string; - embedding: string; - author_id: number; - payload: PayloadType; - }> - ): Promise { - const { error } = await supabase.from("issues").upsert(issues); - if (error) { - throw new Error(`Error during batch upsert: ${error.message}`); - } +} + +async function batchUpsertIssues( + supabase: SupabaseClient, + issues: Array<{ + id: string; + markdown: string; + plaintext: string; + embedding: string; + author_id: number; + payload: PayloadType; + }> +): Promise { + const { error } = await supabase.from("issues").upsert(issues); + if (error) { + throw new Error(`Error during batch upsert: ${error.message}`); } - - async function batchFetchAuthorIds(octokit: InstanceType, logins: string[]): Promise> { - const authorIdMap: Record = {}; - const BATCH_SIZE = 20; - for (let i = 0; i < logins.length; i += BATCH_SIZE) { - const batch = logins.slice(i, i + BATCH_SIZE); - const promises = batch.map(async (login) => { - try { - const response = await octokit.rest.users.getByUsername({ username: login }); - return { login, id: response.data.id }; - } catch (error) { - console.error(`Error fetching author ID for ${login}:`, error); - return { login, id: -1 }; - } - }); - const results = await Promise.all(promises); - results.forEach(({ login, id }) => { - authorIdMap[login] = id; - }); - } - return authorIdMap; +} + +async function batchFetchAuthorIds(octokit: InstanceType, logins: string[]): Promise> { + const authorIdMap: Record = {}; + const BATCH_SIZE = 20; + for (let i = 0; i < logins.length; i += BATCH_SIZE) { + const batch = logins.slice(i, i + BATCH_SIZE); + const promises = batch.map(async (login) => { + try { + const response = await octokit.rest.users.getByUsername({ username: login }); + return { login, id: response.data.id }; + } catch (error) { + console.error(`Error fetching author ID for ${login}:`, error); + return { login, id: -1 }; + } + }); + const results = await Promise.all(promises); + results.forEach(({ login, id }) => { + authorIdMap[login] = id; + }); } + return authorIdMap; +} -async function issueScraper( - username: string, - supabase: SupabaseClient, - voyageApiKey: string, - token?: string, - timestamp?: number -): Promise { +async function issueScraper(username: string, supabase: SupabaseClient, voyageApiKey: string, token?: string, timestamp?: number): Promise { try { if (!username) { throw new Error("Username is required"); @@ -296,9 +275,7 @@ async function issueScraper( const issues = await fetchUserIssuesBatch(octokit, username, timestamp); - const uniqueAuthors = Array.from( - new Set(issues.map((issue) => issue.author?.login).filter((login): login is string => !!login)) - ); + const uniqueAuthors = Array.from(new Set(issues.map((issue) => issue.author?.login).filter((login): login is string => !!login))); const authorIdMap = await batchFetchAuthorIds(octokit, uniqueAuthors);