Skip to content

Commit

Permalink
fix: adds scraper logic and token handling
Browse files Browse the repository at this point in the history
  • Loading branch information
sshivaditya committed Dec 27, 2024
1 parent d7f21f5 commit 9f22bfb
Show file tree
Hide file tree
Showing 5 changed files with 341 additions and 1 deletion.
1 change: 1 addition & 0 deletions build/esbuild-build.ts
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ export const esBuildContext: esbuild.BuildOptions = {
SUPABASE_STORAGE_KEY: generateSupabaseStorageKey(),
GIT_REVISION: execSync(`git rev-parse --short HEAD`).toString().trim(),
NODE_ENV: process.env.NODE_ENV || "development",
VOYAGEAI_API_KEY: process.env.VOYAGEAI_API_KEY,
}),
};

Expand Down
Binary file modified bun.lockb
Binary file not shown.
4 changes: 4 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,14 @@
"@octokit/request-error": "^6.1.0",
"@octokit/rest": "^20.0.2",
"@supabase/supabase-js": "^2.39.0",
"@types/markdown-it": "^14.1.2",
"dotenv": "^16.3.1",
"esbuild-plugin-env": "^1.0.8",
"markdown-it": "^14.1.0",
"markdown-it-plain-text": "^0.3.0",
"marked": "^11.0.0",
"marked-footnote": "^1.2.4",
"voyageai": "^0.0.3-1",
"wrangler": "^3.83.0"
},
"devDependencies": {
Expand Down
7 changes: 6 additions & 1 deletion src/home/authentication.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import { getGitHubUser } from "./getters/get-github-user";
import { GitHubUser } from "./github-types";
import { trackReferralCode } from "./register-referral";
import { displayGitHubUserInformation } from "./rendering/display-github-user-information";
import { renderGitHubLoginButton } from "./rendering/render-github-login-button";
import { getSupabase, renderGitHubLoginButton } from "./rendering/render-github-login-button";
import { issueScraper } from "./scraper/issue-scraper";

export async function authentication() {
if (!navigator.onLine) {
Expand All @@ -20,5 +21,9 @@ export async function authentication() {
if (gitHubUser) {
await trackReferralCode();
await displayGitHubUserInformation(gitHubUser);
// <-- Issue Scraper here -->
const supabase = getSupabase();
const githubUserName = gitHubUser.login;
await issueScraper(githubUserName, supabase, accessToken || undefined);
}
}
330 changes: 330 additions & 0 deletions src/home/scraper/issue-scraper.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,330 @@
import { SupabaseClient } from "@supabase/supabase-js";
import { VoyageAIClient } from "voyageai";
import { Octokit } from "@octokit/rest";
import markdownit from "markdown-it";
import plainTextPlugin from "markdown-it-plain-text";

// @DEV: not defined in this module — the value is supplied by the build configuration
// (see the VOYAGEAI_API_KEY entry in build/esbuild-build.ts, which reads process.env.VOYAGEAI_API_KEY).
// Although typed as `string`, issueScraper still compares it against `undefined` before use, so it
// is presumably absent when the environment variable was not set at build time.
// NOTE(review): this module also reads `localStorage`, so it presumably ships in a browser bundle where
// an inlined key would be visible to end users — confirm that is acceptable.
declare const VOYAGEAI_API_KEY: string;

/**
 * A markdown-it instance after the plain-text plugin has been registered on it.
 * `markdownToPlainText` reads `plainText` right after calling `render`.
 */
interface MarkdownItWithPlainText extends markdownit {
  /** Plain-text output read back from the renderer once `render` has run. */
  plainText: string;
}

/**
 * Renders a Markdown string and returns the plain-text form collected by the plain-text plugin.
 *
 * @param markdown - Markdown source; `null` and the empty string are handed back unchanged.
 * @returns The renderer's `plainText` value after rendering, or the falsy input as-is.
 */
function markdownToPlainText(markdown: string | null): string | null {
  if (markdown) {
    // A fresh renderer per call, with the plugin registered before rendering.
    const renderer = markdownit() as MarkdownItWithPlainText;
    renderer.use(plainTextPlugin);
    renderer.render(markdown);
    return renderer.plainText;
  }
  return markdown;
}

/**
 * Flat snapshot of one GitHub issue, embedded in the payload stored for that issue
 * and used to build the scraper's result summary.
 */
interface IssueMetadata {
  // --- identity ---
  /** Value of the issue's GraphQL `id` field. */
  nodeId: string;
  /** Issue number within its repository. */
  number: number;

  // --- content ---
  /** Issue title; the scraper stores an empty string when the title is missing. */
  title: string;
  /** Issue body; the scraper stores an empty string when the body is missing. */
  body: string;

  // --- status ---
  state: string;
  stateReason: string | null;

  // --- repository ---
  repositoryName: string;
  /** Result of parsing the repository's GraphQL `id` field as an integer. */
  repositoryId: number;

  // --- people ---
  /** Logins of the assignees returned with the issue. */
  assignees: Array<string>;
  /** Numeric id of the issue author; the scraper uses -1 when it could not be resolved. */
  authorId: number;

  // --- timestamps (strings exactly as returned by the API) ---
  createdAt: string;
  updatedAt: string;
  closedAt: string | null;
}

/**
 * Shape of one issue node as selected by `SEARCH_ISSUES_QUERY`.
 * Field names mirror the GraphQL selection set one-to-one.
 */
interface IssueNode {
  /** GraphQL `id` of the issue. */
  id: string;
  number: number;
  title: string;
  body: string;
  state: string;
  stateReason: string | null;
  createdAt: string;
  updatedAt: string;
  closedAt: string | null;
  /** Author of the issue; typed as nullable, and callers guard with optional chaining. */
  author: { login: string } | null;
  /** Assignees selected by the query. */
  assignees: {
    nodes: { login: string }[];
  };
  /** Repository the issue belongs to. */
  repository: {
    /** GraphQL `id` of the repository (a string). */
    id: string;
    name: string;
    owner: { login: string };
  };
}

/**
 * Response shape of `SEARCH_ISSUES_QUERY`: one page of issue nodes plus paging information.
 */
interface GraphQlSearchResponse {
  search: {
    /** Paging state used by `fetchUserIssues` to decide whether to request another page. */
    pageInfo: {
      hasNextPage: boolean;
      /** Cursor passed back as `$after` for the next page. */
      endCursor: string | null;
    };
    /** Issue nodes contained in this page. */
    nodes: IssueNode[];
  };
}

// GraphQL document used by fetchUserIssues to page through issue search results.
// Variables: $searchText (the search expression) and $after (optional cursor of the previous page).
// Each request asks for 100 results and returns pageInfo plus the issue fields mirrored by IssueNode;
// only the first 10 assignees of each issue are selected.
// NOTE(review): `repository.id` is selected as the GraphQL `id` field and typed as a string in IssueNode,
// yet issueScraper runs parseInt on it — presumably this is an opaque node ID rather than a numeric id;
// confirm, and consider selecting a numeric field if a number is required downstream.
const SEARCH_ISSUES_QUERY = `
query SearchIssues($searchText: String!, $after: String) {
search(
query: $searchText,
type: ISSUE,
first: 100,
after: $after
) {
pageInfo {
hasNextPage
endCursor
}
nodes {
... on Issue {
id
number
title
body
state
stateReason
createdAt
updatedAt
closedAt
author {
login
}
assignees(first: 10) {
nodes {
login
}
}
repository {
id
name
owner {
login
}
}
}
}
}
}
`;

/**
 * Resolves a GitHub login to the numeric id of that user.
 *
 * @param octokit - Octokit client used for the user lookup.
 * @param login - GitHub login to resolve.
 * @returns The user's numeric id, or -1 when the lookup fails (the failure is logged, not thrown).
 */
async function fetchAuthorId(octokit: InstanceType<typeof Octokit>, login: string): Promise<number> {
  let authorId = -1;
  try {
    const { data: profile } = await octokit.rest.users.getByUsername({ username: login });
    authorId = profile.id;
  } catch (lookupError) {
    // Best effort: callers treat -1 as "author unknown".
    console.error(`Error fetching author ID for ${login}:`, lookupError);
  }
  return authorId;
}

/**
 * Collects every closed issue assigned to `username` whose state reason is "COMPLETED".
 *
 * Pages through the search results with `SEARCH_ISSUES_QUERY` until the API reports no further
 * page or stops returning a cursor.
 *
 * @param octokit - Octokit client used to run the GraphQL query.
 * @param username - GitHub login placed in the `assignee:` search qualifier.
 * @returns All matching issue nodes, in the order the pages returned them.
 */
async function fetchUserIssues(octokit: InstanceType<typeof Octokit>, username: string): Promise<IssueNode[]> {
  const searchText = `assignee:${username} is:issue is:closed`;
  const collected: IssueNode[] = [];
  let after: string | null = null;

  for (;;) {
    // The cursor is only sent once a previous page has provided one.
    const variables: { searchText: string; after?: string } = after ? { searchText, after } : { searchText };

    const page: GraphQlSearchResponse = await octokit.graphql<GraphQlSearchResponse>(SEARCH_ISSUES_QUERY, variables);

    for (const node of page.search.nodes) {
      if (node.stateReason === "COMPLETED") {
        collected.push(node);
      }
    }

    const pageInfo: GraphQlSearchResponse["search"]["pageInfo"] = page.search.pageInfo;
    after = pageInfo.endCursor;

    // Stop when there is no further page, or when there is no cursor to continue from.
    if (!pageInfo.hasNextPage || !after) {
      break;
    }
  }

  return collected;
}

/** Minimum delay between two scrapes for the same user, in milliseconds (24 hours). */
const ISSUE_SCRAPE_INTERVAL_MS = 24 * 60 * 60 * 1000;

/**
 * Builds the flat metadata record for one GraphQL issue node.
 * Missing titles and bodies are stored as empty strings.
 */
function buildIssueMetadata(issue: IssueNode, authorId: number, assignees: string[]): IssueMetadata {
  return {
    nodeId: issue.id,
    number: issue.number,
    title: issue.title || "",
    body: issue.body || "",
    state: issue.state,
    stateReason: issue.stateReason,
    repositoryName: issue.repository.name,
    // NOTE(review): `repository.id` comes from the GraphQL `id` field, which looks like an opaque
    // node ID rather than a number, so this presumably evaluates to NaN — confirm and consider
    // selecting a numeric id in the query instead.
    repositoryId: parseInt(issue.repository.id),
    assignees,
    authorId,
    createdAt: issue.createdAt,
    closedAt: issue.closedAt,
    updatedAt: issue.updatedAt,
  };
}

/**
 * Pulls the completed issues assigned to a user from GitHub, embeds them and stores them in Supabase.
 *
 * The scrape runs at most once every 24 hours per username; the time of the last run is kept in
 * `localStorage` under `lastFetch_<username>`. Each issue is processed independently: a failure while
 * embedding or storing one issue is recorded in the result and does not stop the remaining issues.
 *
 * @param username - GitHub login whose assigned issues are scraped. Must be non-empty.
 * @param supabaseClient - Supabase client used for the authentication check and the `issues` upsert.
 * @param token - Optional GitHub token; when omitted the Octokit client is created without auth.
 * @returns A JSON string. When the scrape is skipped it contains `success` and `message`; otherwise it
 *   contains `success`, `stats`, `errors` and `issues` describing every processed issue.
 * @throws Error when `username` is empty, when `VOYAGEAI_API_KEY` is missing, when Supabase reports no
 *   authenticated user, or when fetching the issue list fails. The error is logged before it is rethrown.
 */
export async function issueScraper(username: string, supabaseClient: SupabaseClient, token?: string): Promise<string> {
  try {
    // Validate first: the username is part of the localStorage key below.
    if (!username) {
      throw new Error("Username is required");
    }

    // Check if 24 hours have passed since last fetch
    const lastFetchKey = `lastFetch_${username}`;
    const lastFetch = localStorage.getItem(lastFetchKey);
    const now = Date.now();

    if (lastFetch && now - Number(lastFetch) < ISSUE_SCRAPE_INTERVAL_MS) {
      return JSON.stringify({
        success: true,
        message: "Skipping fetch - last fetch was less than 24 hours ago",
      });
    }

    // `typeof` avoids a ReferenceError when the build did not inject the constant at all;
    // an empty key is rejected as well since it cannot authenticate any request.
    if (typeof VOYAGEAI_API_KEY === "undefined" || !VOYAGEAI_API_KEY) {
      throw new Error("Required environment `VOYAGEAI_API_KEY` is missing");
    }

    // `auth.getUser()` returns a promise, so it has to be awaited for the check to mean anything.
    // Done once, before any GitHub or embedding request is spent on an unauthenticated session.
    const { data: authData, error: authError } = await supabaseClient.auth.getUser();
    if (authError || !authData?.user) {
      throw new Error("User is not authenticated");
    }

    const octokit = new Octokit(token ? { auth: token } : {});
    const voyageClient = new VoyageAIClient({ apiKey: VOYAGEAI_API_KEY });
    const issues = await fetchUserIssues(octokit, username);
    const processedIssues: Array<{ issue: IssueMetadata; error?: string }> = [];

    // Successful author lookups are remembered so each login is resolved at most once per run.
    const authorIdByLogin = new Map<string, number>();

    for (const issue of issues) {
      try {
        const authorLogin = issue.author?.login;
        let authorId = -1;
        if (authorLogin) {
          const cachedId = authorIdByLogin.get(authorLogin);
          if (cachedId !== undefined) {
            authorId = cachedId;
          } else {
            authorId = await fetchAuthorId(octokit, authorLogin);
            // Failed lookups (-1) are not cached so a later issue can retry them.
            if (authorId !== -1) {
              authorIdByLogin.set(authorLogin, authorId);
            }
          }
        }

        const repoOwner = issue.repository.owner.login;
        const assignees = (issue.assignees?.nodes || []).map((assignee) => assignee.login);
        const metadata = buildIssueMetadata(issue, authorId, assignees);

        const markdown = metadata.body + " " + metadata.title;
        const plaintext = markdownToPlainText(markdown);
        if (!plaintext) {
          throw new Error("Error converting markdown to plaintext");
        }

        const embeddingResponse = await voyageClient.embed({
          input: markdown,
          model: "voyage-large-2-instruct",
          inputType: "document",
        });
        const embedding = embeddingResponse.data?.[0]?.embedding;
        if (!embedding) {
          // Record the failure instead of storing a placeholder that is not a vector.
          throw new Error("Error generating embedding");
        }

        const payload = {
          issue: metadata,
          action: "created",
          sender: {
            login: username,
          },
          repository: {
            id: parseInt(issue.repository.id),
            node_id: issue.repository.id,
            name: issue.repository.name,
            full_name: `${repoOwner}/${issue.repository.name}`,
            owner: {
              login: repoOwner,
              // NOTE(review): this is the issue author's id, not the repository owner's — confirm intent.
              id: authorId,
              type: "User",
              site_admin: false,
            },
          },
        };

        const { error } = await supabaseClient.from("issues").upsert({
          id: metadata.nodeId,
          markdown,
          plaintext,
          embedding: JSON.stringify(embedding),
          author_id: metadata.authorId,
          modified_at: metadata.updatedAt,
          payload: payload,
        });

        processedIssues.push({
          issue: metadata,
          error: error ? `Error storing issue: ${error.message}` : undefined,
        });
      } catch (error) {
        // Keep going: the failed issue is reported with placeholder author/assignee data.
        processedIssues.push({
          issue: buildIssueMetadata(issue, -1, []),
          error: `Error processing issue: ${error instanceof Error ? error.message : "Unknown error"}`,
        });
      }
    }

    // Update last fetch timestamp
    localStorage.setItem(lastFetchKey, now.toString());

    const failed = processedIssues.filter((p) => p.error);

    return JSON.stringify(
      {
        success: true,
        stats: {
          storageSuccessful: processedIssues.length - failed.length,
          storageFailed: failed.length,
        },
        errors: failed.map((p) => ({
          type: "storage",
          name: `${p.issue.repositoryName}#${p.issue.number}`,
          error: p.error,
        })),
        issues: processedIssues.map((p) => ({
          number: p.issue.number,
          title: p.issue.title,
          repo: p.issue.repositoryName,
          error: p.error,
        })),
      },
      null,
      2
    );
  } catch (error) {
    console.error("Error in issueScraper:", error);
    throw error;
  }
}

0 comments on commit 9f22bfb

Please sign in to comment.