Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,4 @@ _bot.json
bot.json
backup
scratch
indexes/wiki/*.json
indexes/wiki/*.html
indexes/cppref/*.json
indexes/cppref/*.html
indexes/**/*.html
15 changes: 13 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,17 @@ run-dev-container: build-dev-container ## Runs the dev container container
dev: build ## Runs in dev mode
node --enable-source-maps build/src/main.js

.PHONY: wiki-index
wiki-index: prereqs ## Generates embeddings index for wiki articles
.PHONY: embeddings
embeddings: cppref-embeddings man7-embeddings wiki-embeddings ## Generates embeddings index for wiki/cppref/man7 articles

.PHONY: cppref-embeddings
cppref-embeddings: prereqs ## Generates embeddings index for cppref articles
npx tsx indexes/cppref/generate-cppref-embeddings.ts

.PHONY: man7-embeddings
man7-embeddings: prereqs ## Generates embeddings index for man7 articles
npx tsx indexes/man7/generate-man7-embeddings.ts

.PHONY: wiki-embeddings
wiki-embeddings: prereqs ## Generates embeddings index for wiki articles
npx tsx indexes/wiki/generate-wiki-embeddings.ts
5,864 changes: 5,864 additions & 0 deletions indexes/cppref/embeddings.json

Large diffs are not rendered by default.

11 changes: 7 additions & 4 deletions indexes/cppref/generate-cppref-embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import * as fs from "fs";
import {
create_embedding_pipeline,
get_or_create_embedding_pipeline,
generate_embedding,
EMBEDDING_MODEL,
} from "../../src/utils/wiki-embeddings.js";
round_embeddings,
serialize_embeddings_data,
} from "../../src/utils/embeddings.js";

const INDEX_DIR = "indexes/cppref";

Expand Down Expand Up @@ -51,7 +53,7 @@ function create_cppref_embedding_content(entry: CpprefEntry): string {
console.log(`Loaded ${all_entries.length} cppref entries`);

console.log("Loading embedding model (this may take a while on first run)...");
const extractor = await create_embedding_pipeline();
const extractor = await get_or_create_embedding_pipeline();

console.log("Generating embeddings...");
const embeddings: Record<string, number[]> = {};
Expand All @@ -76,7 +78,8 @@ function create_cppref_embedding_content(entry: CpprefEntry): string {
embeddings,
};
const output_path = `${INDEX_DIR}/embeddings.json`;
await fs.promises.writeFile(output_path, JSON.stringify(output_data, null, 2));
output_data.embeddings = round_embeddings(output_data.embeddings);
await fs.promises.writeFile(output_path, serialize_embeddings_data(output_data));
console.log(`Saved embeddings to ${output_path}`);
console.log(`Model: ${EMBEDDING_MODEL}, Dimension: ${embedding_dimension}`);
})();
7,179 changes: 7,179 additions & 0 deletions indexes/man7/embeddings.json

Large diffs are not rendered by default.

64 changes: 64 additions & 0 deletions indexes/man7/generate-man7-embeddings.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
import * as fs from "fs";
import {
get_or_create_embedding_pipeline,
generate_embedding,
EMBEDDING_MODEL,
round_embeddings,
serialize_embeddings_data,
} from "../../src/utils/embeddings.js";

const INDEX_DIR = "indexes/man7";

/**
 * One record of the man7 index as read by this script.
 *
 * Only `page_title` and `short_description` contribute to the embedded text;
 * `path` and `synopsis` are declared but not read when building it.
 */
interface Man7Entry {
    page_title: string;
    path: string;
    short_description?: string;
    synopsis?: string;
}

/**
 * Builds the text that is embedded for a single man7 entry.
 *
 * @param entry - The index record to describe.
 * @returns The page title, followed by a newline and the short description
 *          when that description is present and non-empty.
 */
function create_man7_embedding_content(entry: Man7Entry): string {
    // A missing or empty description contributes no extra line.
    const description_lines = entry.short_description ? [entry.short_description] : [];
    return [entry.page_title, ...description_lines].join("\n");
}

// Script entry point: reads the man7 index, generates one embedding per entry,
// and writes the result to `${INDEX_DIR}/embeddings.json`.
(async () => {
    console.log("Loading man7 index...");
    const index_path = `${INDEX_DIR}/man7_index.json`;
    // NOTE(review): the parsed JSON is trusted to be an array of Man7Entry; nothing validates it here.
    const index_data: Man7Entry[] = JSON.parse(await fs.promises.readFile(index_path, { encoding: "utf-8" }));

    console.log(`Loaded ${index_data.length} man7 entries`);

    // With no entries there is no embedding to derive the dimension from; fail with a
    // clear message before loading the model instead of hitting a TypeError further down.
    if (index_data.length === 0) {
        throw new Error(`No entries found in ${index_path}; cannot generate embeddings`);
    }

    console.log("Loading embedding model (this may take a while on first run)...");
    const extractor = await get_or_create_embedding_pipeline();

    console.log("Generating embeddings...");
    const embeddings: Record<string, number[]> = {};

    let count = 0;
    // Entries are embedded one at a time, in index order.
    for (const entry of index_data) {
        const content = create_man7_embedding_content(entry);
        // Keyed by page title: entries sharing a title overwrite one another.
        embeddings[entry.page_title] = await generate_embedding(content, extractor);
        count++;
        // Progress is reported every 100 entries and once more at the very end.
        if (count % 100 === 0 || count === index_data.length) {
            console.log(`  Generated ${count}/${index_data.length} embeddings...`);
        }
    }

    console.log(`Saving embeddings`);
    // The dimension is taken from the first stored vector.
    const embedding_dimension = embeddings[Object.keys(embeddings)[0]].length;
    const output_data = {
        model_info: {
            model: EMBEDDING_MODEL,
            dimension: embedding_dimension,
        },
        embeddings,
    };
    const output_path = `${INDEX_DIR}/embeddings.json`;
    // NOTE(review): round_embeddings presumably reduces numeric precision to shrink the file — confirm in src/utils/embeddings.
    output_data.embeddings = round_embeddings(output_data.embeddings);
    await fs.promises.writeFile(output_path, serialize_embeddings_data(output_data));
    console.log(`Saved embeddings to ${output_path}`);
    console.log(`Model: ${EMBEDDING_MODEL}, Dimension: ${embedding_dimension}`);
})().catch((error: unknown) => {
    // Report the failure and exit non-zero so the invoking `make` target fails too.
    console.error(error);
    process.exitCode = 1;
});
113 changes: 113 additions & 0 deletions indexes/wiki/embeddings.json

Large diffs are not rendered by default.

19 changes: 13 additions & 6 deletions indexes/wiki/generate-wiki-embeddings.ts
Original file line number Diff line number Diff line change
@@ -1,13 +1,19 @@
import * as fs from "fs";
import { globIterate } from "glob";
import { parse_article, WIKI_ARTICLES_PATH, WikiArticle } from "../../src/modules/wheatley/components/wiki.js";
import {
parse_article,
WIKI_ARTICLES_PATH,
WikiArticle,
create_embedding_content,
} from "../../src/modules/wheatley/components/wiki.js";
import { load_wiki_web_articles } from "../../src/modules/wheatley/wiki-article-loader.js";
import {
create_embedding_pipeline,
get_or_create_embedding_pipeline,
generate_embedding,
create_embedding_content,
EMBEDDING_MODEL,
} from "../../src/utils/wiki-embeddings.js";
round_embeddings,
serialize_embeddings_data,
} from "../../src/utils/embeddings.js";

const INDEX_DIR = "indexes/wiki";

Expand Down Expand Up @@ -43,7 +49,7 @@ const INDEX_DIR = "indexes/wiki";
console.log(`Total articles: ${Object.keys(articles).length}`);

console.log("Loading embedding model (this may take a while on first run)...");
const extractor = await create_embedding_pipeline();
const extractor = await get_or_create_embedding_pipeline();

console.log("Generating embeddings...");
const embeddings: Record<string, number[]> = {};
Expand All @@ -66,7 +72,8 @@ const INDEX_DIR = "indexes/wiki";
embeddings,
};
const output_path = `${INDEX_DIR}/embeddings.json`;
await fs.promises.writeFile(output_path, JSON.stringify(output_data, null, 2));
output_data.embeddings = round_embeddings(output_data.embeddings);
await fs.promises.writeFile(output_path, serialize_embeddings_data(output_data));
console.log(`Saved embeddings to ${output_path}`);
console.log(`Model: ${EMBEDDING_MODEL}, Dimension: ${embedding_dimension}`);
})();
Loading