diff --git a/plugins/vite-plugin-llms-txt.ts b/plugins/vite-plugin-llms-txt.ts new file mode 100644 index 0000000..7556782 --- /dev/null +++ b/plugins/vite-plugin-llms-txt.ts @@ -0,0 +1,105 @@ +/** + * Vite plugin for generating llms.txt + * + * Generates an llms.txt file at build time that describes the Malloy models + * and schema to help LLMs understand the data explorer site. + * + * In dev mode, serves llms.txt dynamically via middleware. + */ + +import type { Plugin, ResolvedConfig, ViteDevServer } from "vite"; +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import { + extractModelsSchema, + getDataFiles, + getNotebooks, + generateLlmsTxtContent, +} from "../src/llms-txt"; + +export interface LlmsTxtPluginOptions { + siteTitle?: string; + modelsDir?: string; +} + +export default function llmsTxtPlugin( + options: LlmsTxtPluginOptions = {}, +): Plugin { + const { siteTitle = "Malloy Data Explorer", modelsDir = "models" } = options; + + let config: ResolvedConfig; + + async function generateContent(): Promise { + const modelsDirPath = path.join(config.root, modelsDir); + + const [models, dataFiles, notebooks] = await Promise.all([ + extractModelsSchema(modelsDirPath), + getDataFiles(modelsDirPath), + getNotebooks(modelsDirPath), + ]); + + return generateLlmsTxtContent({ + siteTitle, + basePath: config.base, + models, + dataFiles, + notebooks, + }); + } + + return { + name: "vite-plugin-llms-txt", + + configResolved(resolvedConfig) { + config = resolvedConfig; + }, + + // DEV MODE: Serve llms.txt dynamically + configureServer(server: ViteDevServer) { + server.middlewares.use((req, res, next) => { + if (req.url === "/llms.txt") { + void (async () => { + try { + // Regenerate on each request in dev mode for hot reloading + const content = await generateContent(); + res.setHeader("Content-Type", "text/plain; charset=utf-8"); + res.end(content); + } catch (error) { + console.error("[llms.txt] Error generating content:", error); + 
res.statusCode = 500; + res.end( + `Error generating llms.txt: ${error instanceof Error ? error.message : String(error)}`, + ); + } + })(); + return; + } + next(); + }); + }, + + // BUILD MODE: Generate file after bundle + async closeBundle() { + if (process.env["VITEST"] || process.env["NODE_ENV"] === "test") { + return; + } + if (config.command !== "build") return; + + try { + const content = await generateContent(); + + const outputPath = path.join( + config.root, + config.build.outDir, + "llms.txt", + ); + await fs.writeFile(outputPath, content, "utf-8"); + + console.log(`[llms.txt] Generated ${outputPath}`); + } catch (error) { + console.error("[llms.txt] Error generating file:", error); + throw error; + } + }, + }; +} diff --git a/proposals/adr/001-llms-txt-build-time-generation.md b/proposals/adr/001-llms-txt-build-time-generation.md new file mode 100644 index 0000000..a503cbe --- /dev/null +++ b/proposals/adr/001-llms-txt-build-time-generation.md @@ -0,0 +1,296 @@ +# ADR 001: Build-Time Generation for llms.txt and Future Static Routes + +## Status + +Accepted + +## Context + +The Malloy Data Explorer needs to generate an `llms.txt` file that describes the schema of all Malloy models for LLM consumption. We evaluated two approaches: + +1. **Build-time generation** - Generate static files during the build process +2. **Runtime generation** - Generate content dynamically in the browser + +### Key Technical Constraint: Malloy Requires Database Connection for Schema Resolution + +**Critical Discovery**: Malloy's compilation process is not just syntax parsing. 
It requires an active database connection to: + +- Resolve table schemas from the underlying database +- Infer field types from actual data structures +- Validate source definitions against real tables +- Build the complete semantic model with type information + +This means: + +```typescript +// This is NOT possible without a database: +const model = await runtime.loadModel(modelUrl); // Needs DB connection +const explores = model.exportedExplores; // Contains resolved schema + +// The model compilation process queries the database: +// "DESCRIBE table_name" or equivalent to get field information +``` + +**You cannot extract accurate schema information from Malloy models without database access.** + +### Why Build-Time Makes Sense + +Since we must connect to a database anyway, we have two options: + +1. **Node.js DuckDB at build time** (current approach) + - Fast native DuckDB + - All data available locally + - One-time compilation cost + +2. **Browser WASM DuckDB at runtime** + - Slower WASM performance + - Must load data into browser + - Compilation cost on every page load + - SEO challenges + +## Decision + +**We will generate llms.txt (and future static content) at build time.** + +### Primary Reasons + +#### 1. **SEO & Standards Compliance** + +- LLM crawlers expect `/llms.txt` as a static, plain-text file at the root +- Standard practice across the industry (like `robots.txt`, `sitemap.xml`) +- Immediate availability without JavaScript execution +- Some crawlers may not execute JavaScript or wait for async operations + +#### 2. **Performance** + +- Zero runtime overhead for serving llms.txt +- No browser-side model compilation required +- Instant response with static file serving +- No DuckDB WASM initialization cost + +#### 3. **Database Requirement** + +- Malloy compilation requires database connection regardless +- Node.js DuckDB is faster than WASM DuckDB +- Data files are available locally during build +- Single compilation vs. 
compilation on every user visit + +#### 4. **Build-Time Data Availability** + +- All models are known at build time (in `models/` directory) +- All data files are available locally (in `models/data/`) +- Schema is static - doesn't change per user or per request +- Perfect use case for static site generation + +## Consequences + +### Positive + +✅ **Fast serving** - Static files are served instantly +✅ **SEO-friendly** - Standard file location and format +✅ **No runtime cost** - Zero JavaScript execution needed +✅ **Reliable** - Works even if browser has issues +✅ **Standards-compliant** - Follows web best practices +✅ **Efficient** - Compile once, serve many times + +### Negative + +❌ **Build complexity** - Requires Node.js DuckDB and async build process +❌ **Build time** - Adds time to the build (currently acceptable) +❌ **Stale data** - If models change, must rebuild (acceptable for static data) + +### Neutral + +⚪ **Code duplication** - Some schema extraction logic differs from runtime +⚪ **Two environments** - Must handle both Node.js and browser contexts + +## Future Direction: Static Site Generation (SSG) + +This decision establishes a pattern for future optimization: + +### Long-Term Goal: Pre-render Most Routes at Build Time + +Since we have all the data at build time, we should generate static HTML for: + +``` +/ → Home page (list of models) +/model/{modelName} → Model schema page +/model/{modelName}/preview/{sourceName} → Preview pages +/model/{modelName}/query/{queryName} → Named query results +``` + +### Benefits of Full SSG + +1. **Instant page loads** - No client-side data fetching or compilation +2. **Progressive enhancement** - Pages work without JavaScript, enhanced with it +3. **SEO** - All content crawlable and indexable +4. **Performance** - Compile models once at build, not per-user +5. **Caching** - Static HTML caches perfectly on CDNs +6. 
**Resilience** - Site works even if DuckDB WASM fails to load + +### SSG Implementation Path + +```typescript +// vite.config.ts - Future enhancement +export default { + plugins: [ + llmsTxtPlugin(), + // Future: Static route generation + ssgPlugin({ + routes: async () => { + const models = await loadAllModels(); + return [ + "/", + ...models.flatMap((model) => [ + `/model/${model.name}`, + ...model.sources.map( + (s) => `/model/${model.name}/preview/${s.name}`, + ), + ...model.queries.map((q) => `/model/${model.name}/query/${q.name}`), + ]), + ]; + }, + }), + ], +}; +``` + +### Hybrid Approach (Best of Both Worlds) + +``` +Static (build-time): +- /llms.txt ✅ +- / (home page) +- /model/{name} (schema view) +- /model/{name}/preview/{source} (data preview) +- /model/{name}/query/{namedQuery} (pre-defined queries) + +Dynamic (runtime): +- /model/{name}/explorer/{source}?query=... (custom queries) +- Any user interaction/filtering +``` + +This gives us: + +- Fast initial loads +- SEO for all content +- Interactivity where needed +- Best performance characteristics + +## Technical Implementation + +### Current: llms.txt Generation + +```typescript +// plugins/vite-plugin-llms-txt.ts +async closeBundle() { + // 1. Create Node.js DuckDB connection + const connection = new DuckDBConnection({ + name: "llms-txt-build", + workingDirectory: modelsDir, + }); + + // 2. Create Malloy runtime with database connection + const runtime = new SingleConnectionRuntime({ connection, urlReader }); + + // 3. Compile models (requires DB to resolve schemas) + const models = await Promise.all( + malloyFiles.map(async (file) => { + const model = await runtime.loadModel(fileUrl); // DB access here + return extractFromModel(model); + }) + ); + + // 4. 
Generate static content + const content = generateLlmsTxtContent({ models, dataFiles }); + await fs.writeFile('dist/llms.txt', content); +} +``` + +### Why Runtime Generation Doesn't Work + +```typescript +// ❌ This LOOKS simpler but doesn't work: +function LlmsTxtRoute() { + // Problem 1: Must load ALL models (slow) + // Problem 2: Must compile ALL models with DuckDB WASM (slow) + // Problem 3: /llms.txt needs to be at root, not /#/llms.txt + // Problem 4: Requires JavaScript execution + // Problem 5: Not standard - crawlers expect static file + + const models = await loadAllModels(); // Slow + const compiled = await compileAll(models); // Very slow + return generateLlmsTxt(compiled); +} +``` + +### Alternative Considered: Service Worker + +```typescript +// ❌ Also problematic: +self.addEventListener("fetch", (event) => { + if (event.request.url.endsWith("/llms.txt")) { + // Still requires compiling models in browser + // Still needs DuckDB WASM + // Adds service worker complexity + // Delays first response + } +}); +``` + +## Monitoring & Success Metrics + +- Build time for llms.txt generation (current: ~500ms, acceptable) +- llms.txt file size (current: ~5KB, excellent) +- Time to first byte for /llms.txt (should be <50ms) +- LLM crawler success rate (monitor logs) + +## References + +- [llms.txt Standard](https://llmstxt.org/) +- Malloy compilation architecture (requires DB connection) +- Vite SSG plugins: `vite-plugin-ssr`, `vite-ssg` +- [Schema.tsx](../src/Schema.tsx) - Runtime schema rendering (different use case) +- [schema-extractor.ts](../src/llms-txt/schema-extractor.ts) - Build-time extraction + +## Alternatives Considered + +### 1. Runtime Generation (Rejected) + +- **Why rejected**: SEO issues, performance overhead, non-standard location +- **When to reconsider**: If schema becomes user-specific or dynamic + +### 2. 
Mixed: Parse at build, compile at runtime (Rejected)
+
+- **Why rejected**: Malloy requires DB connection to parse accurately
+- **Misconception**: You can't just parse the `.malloy` syntax file to get schema
+- **Reality**: Field types, table schemas come from the database, not the model file
+
+### 3. External Service (Rejected)
+
+- **Why rejected**: Adds infrastructure complexity, latency
+- **When to consider**: If build times become problematic (>5 seconds)
+
+## Notes
+
+- Dev mode uses dynamic generation for hot-reloading (acceptable trade-off)
+- Both dev and prod use the same generation logic, just triggered differently
+- This pattern should extend to other routes as we implement full SSG
+- The goal is to serve the entire site as static HTML with progressive enhancement
+
+## Decision Date
+
+2026-02-01
+
+## Decision Makers
+
+- Development team
+- Architecture review
+
+## Supersedes
+
+N/A (First ADR)
+
+## Superseded By
+
+N/A (Current decision)
diff --git a/proposals/llm_context.md b/proposals/llm_context.md
new file mode 100644
index 0000000..f1892c2
--- /dev/null
+++ b/proposals/llm_context.md
@@ -0,0 +1,7 @@
+# Problem
+
+LLMs cannot directly analyze the site contents to understand the data models used and their schema, and thus cannot generate Malloy queries.
+
+## Proposal
+
+Create an `llms.txt` file at build time that uses the internal Vite import.meta to gather context of Malloy models and schema. Structure this for LLM consumption in a single file so that LLMs can fully comprehend the data models supported by the site, the site structure, how to issue arbitrary queries and download data.
diff --git a/src/llms-txt/generator.ts b/src/llms-txt/generator.ts new file mode 100644 index 0000000..24e0f63 --- /dev/null +++ b/src/llms-txt/generator.ts @@ -0,0 +1,203 @@ +/** + * LLMs.txt content generator + * + * Generates the llms.txt file content from extracted model schema + */ + +import type { ExtractedModel } from "./types"; + +export interface GeneratorOptions { + siteTitle: string; + basePath: string; + models: ExtractedModel[]; + dataFiles: string[]; + notebooks: string[]; +} + +export function generateLlmsTxtContent(options: GeneratorOptions): string { + const { siteTitle, basePath, models, dataFiles, notebooks } = options; + + const sections = [ + generateHeader(siteTitle, basePath), + generateOverview(siteTitle, basePath, models, dataFiles, notebooks), + generateModelsSection(models, basePath), + generateMalloyQueryGuide(), + ]; + + return sections.join("\n\n"); +} + +function generateHeader(siteTitle: string, basePath: string): string { + const base = basePath.endsWith("/") ? basePath.slice(0, -1) : basePath; + return `# ${siteTitle} + +> Malloy Data Explorer - Static web app for exploring semantic data models +> All queries run in-browser using DuckDB WASM + +**Site URL:** \`${base}/\``; +} + +function generateOverview( + _siteTitle: string, + basePath: string, + models: ExtractedModel[], + dataFiles: string[], + notebooks: string[], +): string { + const base = basePath.endsWith("/") ? basePath.slice(0, -1) : basePath; + + // Content summary + const contentItems = [ + `${String(models.length)} Malloy model${models.length !== 1 ? "s" : ""}`, + `${String(dataFiles.length)} data file${dataFiles.length !== 1 ? "s" : ""}`, + ...(notebooks.length > 0 + ? [ + `${String(notebooks.length)} notebook${notebooks.length !== 1 ? "s" : ""}`, + ] + : []), + ]; + + // Data files list (compact) + const dataFilesList = + dataFiles.length > 0 ? 
`\n\n**Data Files:** ${dataFiles.join(", ")}` : ""; + + // Notebooks list (compact - just names) + const notebooksList = + notebooks.length > 0 ? `\n\n**Notebooks:** ${notebooks.join(", ")}` : ""; + + return `## Overview + +**Content:** ${contentItems.join(" • ")} +**Capabilities:** Browse schemas • Preview data • Build queries • Download results (CSV/JSON)${dataFilesList}${notebooksList} + +## URL Patterns + +All URLs with \`/#/\` prefix return HTML pages. \`/downloads/\` URLs return raw files. + +| Pattern | Returns | Description | +|---------|---------|-------------| +| \`${base}/#/\` | HTML | Home - list all models | +| \`${base}/#/model/{model}\` | HTML | Model schema browser | +| \`${base}/#/model/{model}/preview/{source}\` | HTML | Preview source data (50 rows) | +| \`${base}/#/model/{model}/explorer/{source}\` | HTML | Interactive query builder | +| \`${base}/#/model/{model}/explorer/{source}?query={malloy}&run=true\` | HTML | Execute query, show results | +| \`${base}/#/model/{model}/query/{queryName}\` | HTML | Run named query, show results | +| \`${base}/#/notebook/{notebook}\` | HTML | View notebook with queries/visualizations | +| \`${base}/downloads/models/{model}.malloy\` | Text | Download model source file | +| \`${base}/downloads/notebooks/{notebook}.malloynb\` | Text | Download notebook file | +| \`${base}/downloads/data/{file}\` | File | Download data file (CSV/Parquet/JSON/Excel) |`; +} + +function generateModelsSection( + models: ExtractedModel[], + basePath: string, +): string { + if (models.length === 0) { + return "## Models\n\nNo models available."; + } + + const base = basePath.endsWith("/") ? basePath.slice(0, -1) : basePath; + + const modelSections = models.map((model) => { + const sourceSections = model.sources + .map((source) => { + // Group fields by type + const groupByType = (fields: { name: string; type: string }[]) => { + const grouped = new Map(); + for (const field of fields) { + const existing = grouped.get(field.type) ?? 
[]; + existing.push(`\`${field.name}\``); + grouped.set(field.type, existing); + } + return grouped; + }; + + const dimsByType = groupByType(source.dimensions); + const measuresByType = groupByType(source.measures); + + const dims = + dimsByType.size > 0 + ? Array.from(dimsByType.entries()) + .map(([type, names]) => `${type}: ${names.join(", ")}`) + .join(" • ") + : "none"; + + const measures = + measuresByType.size > 0 + ? Array.from(measuresByType.entries()) + .map(([type, names]) => `${type}: ${names.join(", ")}`) + .join(" • ") + : "none"; + + const views = + source.views.length > 0 + ? source.views.map((v) => `\`${v.name}\``).join(", ") + : "none"; + + const tableInfo = source.tablePath + ? ` | Table: \`${source.tablePath}\`` + : ""; + + return `**${source.name}**${tableInfo} +- Dims: ${dims} +- Measures: ${measures} +- Views: ${views}`; + }) + .join("\n\n"); + + const queriesInfo = + model.queries.length > 0 + ? `\n**Named Queries:** ${model.queries.map((q) => `\`${q.name}\``).join(", ")}` + : ""; + + const urls = `[Browse](${base}/#/model/${encodeURIComponent(model.name)}) | [Download](${base}/downloads/models/${encodeURIComponent(model.name)}.malloy)`; + + return `### ${model.name} +${urls}${queriesInfo} + +**Sources:** +${sourceSections}`; + }); + + return `## Models + +${modelSections.join("\n\n---\n\n")}`; +} + +function generateMalloyQueryGuide(): string { + return `## Malloy Query Syntax + +**Basic Structure:** +\`\`\`malloy +run: source_name -> { + group_by: field1, field2 + aggregate: measure1, measure2 + where: condition + order_by: field desc + limit: 100 +} +\`\`\` + +**Common Operations:** +- Select all: \`source -> { select: * }\` +- Group: \`source -> { group_by: field }\` +- Aggregate: \`source -> { aggregate: measure }\` +- Filter: \`source -> { where: field > value }\` +- Sort: \`source -> { order_by: field desc }\` +- Limit: \`source -> { limit: 10 }\` +- Run view: \`source -> view_name\` +- Nest: \`{ nest: name is { group_by: field; 
aggregate: measure } }\` + +**Time Granularity:** \`.year\`, \`.quarter\`, \`.month\`, \`.week\`, \`.day\` (e.g., \`order_date.month\`) + +**Example Query:** +\`\`\`malloy +run: orders -> { + group_by: category + aggregate: order_count, total_revenue + where: status = 'Delivered' + order_by: total_revenue desc + limit: 10 +} +\`\`\``; +} diff --git a/src/llms-txt/index.ts b/src/llms-txt/index.ts new file mode 100644 index 0000000..6d10e05 --- /dev/null +++ b/src/llms-txt/index.ts @@ -0,0 +1,20 @@ +/** + * LLMs.txt generation module + * + * Exports functions for generating llms.txt content from Malloy models + */ + +export { + extractModelsSchema, + getDataFiles, + getNotebooks, +} from "./schema-extractor"; +export { generateLlmsTxtContent } from "./generator"; +export type { + ExtractedModel, + ExtractedSource, + ExtractedField, + ExtractedQuery, + LlmsTxtOptions, +} from "./types"; +export type { GeneratorOptions } from "./generator"; diff --git a/src/llms-txt/schema-extractor.ts b/src/llms-txt/schema-extractor.ts new file mode 100644 index 0000000..b724f0b --- /dev/null +++ b/src/llms-txt/schema-extractor.ts @@ -0,0 +1,258 @@ +/** + * Schema extractor using Malloy libraries + * + * Compiles Malloy models and extracts schema information (sources, fields, queries) + * for generating llms.txt + */ + +import * as malloy from "@malloydata/malloy"; +import { isSourceDef } from "@malloydata/malloy"; +import { DuckDBConnection } from "@malloydata/db-duckdb"; +import * as fs from "node:fs/promises"; +import * as path from "node:path"; +import type { + ExtractedModel, + ExtractedSource, + ExtractedField, + ExtractedQuery, + ExtractedView, +} from "./types"; + +/** + * Extract schema information from all Malloy models in a directory + */ +export async function extractModelsSchema( + modelsDir: string, +): Promise { + try { + await fs.access(modelsDir); + } catch { + return []; + } + + const files = await fs.readdir(modelsDir); + const malloyFiles = files + .filter((f) => 
f.endsWith(".malloy")) + .map((f) => path.join(modelsDir, f)); + + if (malloyFiles.length === 0) { + return []; + } + + // Create a DuckDB connection for model compilation + // Set workingDirectory so DuckDB can find data files referenced in models + const connection = new DuckDBConnection({ + name: "llms-txt-build", + workingDirectory: modelsDir, + }); + + // Create runtime with a URL reader that loads from the filesystem + const runtime = new malloy.SingleConnectionRuntime({ + connection, + urlReader: { + readURL: async (url: URL) => { + // Handle file:// URLs pointing to our models directory + const fileName = url.pathname.split("/").pop() ?? ""; + const filePath = path.join(modelsDir, fileName); + + try { + await fs.access(filePath); + } catch { + throw new Error(`Model file not found: ${filePath}`); + } + + const contents = await fs.readFile(filePath, "utf-8"); + return { contents }; + }, + }, + }); + + // Process all models in parallel - they're read-only operations + const results = await Promise.all( + malloyFiles.map(async (filePath) => { + const modelName = path.basename(filePath, ".malloy"); + const modelCode = await fs.readFile(filePath, "utf-8"); + + try { + const modelUrl = new URL(`file:///${modelName}.malloy`); + const modelMaterializer = runtime.loadModel(modelUrl); + const model = await modelMaterializer.getModel(); + + return extractFromModel(modelName, model, modelCode); + } catch (error) { + // Log but continue with other models + console.warn( + `[llms.txt] Warning: Could not compile model ${modelName}:`, + error instanceof Error ? 
error.message : error, + ); + return null; + } + }), + ); + + // Clean up connection + await connection.close(); + + // Filter out failed models (null values) and return + return results.filter((result): result is ExtractedModel => result !== null); +} + +/** + * Extract schema information from a compiled Malloy model + */ +function extractFromModel( + name: string, + model: malloy.Model, + rawCode: string, +): ExtractedModel { + const modelDef = model._modelDef; + + // Extract sources from exported explores + const sources: ExtractedSource[] = model.exportedExplores.map((explore) => { + const dimensions: ExtractedField[] = []; + const measures: ExtractedField[] = []; + const views: ExtractedView[] = []; + + // Iterate through all fields and categorize them + for (const field of explore.allFields) { + // Skip hidden fields + if (isFieldHidden(field)) { + continue; + } + + if (field.isQueryField()) { + // This is a view + views.push({ + name: field.name, + }); + } else if (field.isAtomicField() && field.isCalculation()) { + // This is a measure + measures.push({ + name: field.name, + type: getFieldType(field), + }); + } else if (field.isAtomicField()) { + // This is a dimension + dimensions.push({ + name: field.name, + type: getFieldType(field), + }); + } + // Skip explore/join fields for now + } + + const tablePath = getTablePath(modelDef, explore.name); + + return { + name: explore.name, + ...(tablePath !== undefined && { tablePath }), + dimensions, + measures, + views, + }; + }); + + // Extract named queries + const queries: ExtractedQuery[] = model.namedQueries.map((q) => ({ + name: q.name, + })); + + return { + name, + sources, + queries, + rawCode, + }; +} + +/** + * Get the table path for a source from the model definition + */ +function getTablePath( + modelDef: malloy.ModelDef, + sourceName: string, +): string | undefined { + const source = modelDef.contents[sourceName]; + if (source !== undefined && isSourceDef(source) && source.type === "table") { + // 
Remove duckdb: prefix if present + return source.tablePath.replace(/^duckdb:/, ""); + } + return undefined; +} + +/** + * Get a human-readable type string for a field + */ +function getFieldType(field: malloy.Field): string { + if (field.isAtomicField()) { + return field.type; + } + if (field.isExplore()) { + return "explore"; + } + return "unknown"; +} + +/** + * Check if a field should be hidden based on its tags + * Simplified version of the schema-utils.ts isFieldHidden + */ +function isFieldHidden(field: malloy.Field): boolean { + const { name, parentExplore } = field; + + try { + const { tag } = parentExplore.tagParse(); + const hiddenStrings = + tag + .array("hidden") + ?.map((_tag) => _tag.text()) + .filter((t): t is string => typeof t === "string") || []; + + const patternText = tag.text("hidden", "pattern"); + const pattern = patternText ? new RegExp(patternText) : undefined; + + return pattern?.test(name) || hiddenStrings.includes(name); + } catch { + return false; + } +} + +/** + * Get list of data files in the models/data directory + */ +export async function getDataFiles(modelsDir: string): Promise { + const dataDir = path.join(modelsDir, "data"); + + try { + await fs.access(dataDir); + } catch { + return []; + } + + const files = await fs.readdir(dataDir); + const extensions = [ + ".csv", + ".parquet", + ".json", + ".jsonl", + ".ndjson", + ".xlsx", + ]; + return files.filter((f) => extensions.some((ext) => f.endsWith(ext))); +} + +/** + * Get list of notebook files in the models directory + */ +export async function getNotebooks(modelsDir: string): Promise { + try { + await fs.access(modelsDir); + } catch { + return []; + } + + const files = await fs.readdir(modelsDir); + return files + .filter((f) => f.endsWith(".malloynb")) + .map((f) => path.basename(f, ".malloynb")); +} diff --git a/src/llms-txt/types.ts b/src/llms-txt/types.ts new file mode 100644 index 0000000..c7919ed --- /dev/null +++ b/src/llms-txt/types.ts @@ -0,0 +1,36 @@ +/** + * Types 
for llms.txt generation + */ + +export interface ExtractedModel { + name: string; + sources: ExtractedSource[]; + queries: ExtractedQuery[]; + rawCode: string; +} + +export interface ExtractedSource { + name: string; + tablePath?: string; + dimensions: ExtractedField[]; + measures: ExtractedField[]; + views: ExtractedView[]; +} + +export interface ExtractedView { + name: string; +} + +export interface ExtractedField { + name: string; + type: string; +} + +export interface ExtractedQuery { + name: string; +} + +export interface LlmsTxtOptions { + siteTitle?: string; + modelsDir?: string; +} diff --git a/tsconfig.node.json b/tsconfig.node.json index 8f8e484..a60eb35 100644 --- a/tsconfig.node.json +++ b/tsconfig.node.json @@ -32,5 +32,10 @@ "noImplicitThis": true, "noPropertyAccessFromIndexSignature": true }, - "include": ["vite.config.ts", "playwright.config.ts", "plugins/**/*.ts"] + "include": [ + "vite.config.ts", + "playwright.config.ts", + "src/llms-txt/**/*.ts", + "plugins/**/*.ts" + ] } diff --git a/vite.config.ts b/vite.config.ts index 67534f9..68616ac 100644 --- a/vite.config.ts +++ b/vite.config.ts @@ -3,12 +3,13 @@ import { defineConfig, type UserConfig } from "vite"; import react from "@vitejs/plugin-react"; import svgr from "vite-plugin-svgr"; import copyDownloadsPlugin from "./plugins/vite-plugin-copy-downloads"; +import llmsTxtPlugin from "./plugins/vite-plugin-llms-txt"; // https://vite.dev/config/ const config: UserConfig = defineConfig({ // NOTE: THIS PATH MUST END WITH A TRAILING SLASH base: process.env["BASE_PUBLIC_PATH"] ?? "/", - plugins: [react(), svgr(), copyDownloadsPlugin()], + plugins: [react(), svgr(), copyDownloadsPlugin(), llmsTxtPlugin()], define: { "process.env": {}, },