Skip to content

Commit 4b89008

Browse files
committed
feat(app): add file uploads
1 parent c650d1c commit 4b89008

25 files changed

+1035
-86
lines changed

.prettierignore

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -35,4 +35,7 @@ coverage
3535
*.swp
3636

3737
# Ignore all files with the .DS_Store extension (macOS specific)
38-
.DS_Store
38+
.DS_Store
39+
40+
# Ignore all files in uploads directory
41+
uploads

backend.dockerfile

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -9,6 +9,7 @@ COPY package.json /home/perplexica/
99
COPY yarn.lock /home/perplexica/
1010

1111
RUN mkdir /home/perplexica/data
12+
RUN mkdir /home/perplexica/uploads
1213

1314
RUN yarn install --frozen-lockfile --network-timeout 600000
1415
RUN yarn build

docker-compose.yaml

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@ services:
2222
- 3001:3001
2323
volumes:
2424
- backend-dbstore:/home/perplexica/data
25+
- uploads:/home/perplexica/uploads
2526
- ./config.toml:/home/perplexica/config.toml
2627
extra_hosts:
2728
- 'host.docker.internal:host-gateway'
@@ -50,3 +51,4 @@ networks:
5051

5152
volumes:
5253
backend-dbstore:
54+
uploads:

package.json

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@
66
"scripts": {
77
"start": "npm run db:push && node dist/app.js",
88
"build": "tsc",
9-
"dev": "nodemon src/app.ts",
9+
"dev": "nodemon --ignore uploads/ src/app.ts ",
1010
"db:push": "drizzle-kit push sqlite",
1111
"format": "prettier . --check",
1212
"format:write": "prettier . --write"
@@ -16,6 +16,7 @@
1616
"@types/cors": "^2.8.17",
1717
"@types/express": "^4.17.21",
1818
"@types/html-to-text": "^9.0.4",
19+
"@types/multer": "^1.4.12",
1920
"@types/pdf-parse": "^1.1.4",
2021
"@types/readable-stream": "^4.0.11",
2122
"@types/ws": "^8.5.12",
@@ -41,6 +42,8 @@
4142
"express": "^4.19.2",
4243
"html-to-text": "^9.0.5",
4344
"langchain": "^0.1.30",
45+
"mammoth": "^1.8.0",
46+
"multer": "^1.4.5-lts.1",
4447
"pdf-parse": "^1.1.1",
4548
"winston": "^3.13.0",
4649
"ws": "^8.17.1",

src/agents/webSearchAgent.ts

Lines changed: 84 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -20,10 +20,12 @@ import eventEmitter from 'events';
2020
import computeSimilarity from '../utils/computeSimilarity';
2121
import logger from '../utils/logger';
2222
import LineListOutputParser from '../lib/outputParsers/listLineOutputParser';
23-
import { getDocumentsFromLinks } from '../lib/linkDocument';
2423
import LineOutputParser from '../lib/outputParsers/lineOutputParser';
2524
import { IterableReadableStream } from '@langchain/core/utils/stream';
2625
import { ChatOpenAI } from '@langchain/openai';
26+
import path from 'path';
27+
import fs from 'fs';
28+
import { getDocumentsFromLinks } from '../utils/documents';
2729

2830
const basicSearchRetrieverPrompt = `
2931
You are an AI question rephraser. You will be given a conversation and a follow-up question, you will have to rephrase the follow up question so it is a standalone question and can be used by another LLM to search the web for information to answer it.
@@ -316,6 +318,7 @@ const createBasicWebSearchAnsweringChain = (
316318
llm: BaseChatModel,
317319
embeddings: Embeddings,
318320
optimizationMode: 'speed' | 'balanced' | 'quality',
321+
fileIds: string[],
319322
) => {
320323
const basicWebSearchRetrieverChain = createBasicWebSearchRetrieverChain(llm);
321324

@@ -336,16 +339,76 @@ const createBasicWebSearchAnsweringChain = (
336339
return docs;
337340
}
338341

342+
const filesData = fileIds
343+
.map((file) => {
344+
const filePath = path.join(process.cwd(), 'uploads', file);
345+
346+
const contentPath = filePath + '-extracted.json';
347+
const embeddingsPath = filePath + '-embeddings.json';
348+
349+
const content = JSON.parse(fs.readFileSync(contentPath, 'utf8'));
350+
const embeddings = JSON.parse(fs.readFileSync(embeddingsPath, 'utf8'));
351+
352+
const fileSimilaritySearchObject = content.contents.map(
353+
(c: string, i) => {
354+
return {
355+
fileName: content.title,
356+
content: c,
357+
embeddings: embeddings.embeddings[i],
358+
};
359+
},
360+
);
361+
362+
return fileSimilaritySearchObject;
363+
})
364+
.flat();
365+
339366
if (query.toLocaleLowerCase() === 'summarize') {
340-
return docs.slice(0, 15)
367+
return docs.slice(0, 15);
341368
}
342369

343370
const docsWithContent = docs.filter(
344371
(doc) => doc.pageContent && doc.pageContent.length > 0,
345372
);
346373

347374
if (optimizationMode === 'speed') {
348-
return docsWithContent.slice(0, 15);
375+
if (filesData.length > 0) {
376+
const [queryEmbedding] = await Promise.all([
377+
embeddings.embedQuery(query),
378+
]);
379+
380+
const fileDocs = filesData.map((fileData) => {
381+
return new Document({
382+
pageContent: fileData.content,
383+
metadata: {
384+
title: fileData.fileName,
385+
url: `File`,
386+
},
387+
});
388+
});
389+
390+
const similarity = filesData.map((fileData, i) => {
391+
const sim = computeSimilarity(queryEmbedding, fileData.embeddings);
392+
393+
return {
394+
index: i,
395+
similarity: sim,
396+
};
397+
});
398+
399+
const sortedDocs = similarity
400+
.filter((sim) => sim.similarity > 0.3)
401+
.sort((a, b) => b.similarity - a.similarity)
402+
.slice(0, 8)
403+
.map((sim) => fileDocs[sim.index]);
404+
405+
return [
406+
...sortedDocs,
407+
...docsWithContent.slice(0, 15 - sortedDocs.length),
408+
];
409+
} else {
410+
return docsWithContent.slice(0, 15);
411+
}
349412
} else if (optimizationMode === 'balanced') {
350413
const [docEmbeddings, queryEmbedding] = await Promise.all([
351414
embeddings.embedDocuments(
@@ -354,6 +417,20 @@ const createBasicWebSearchAnsweringChain = (
354417
embeddings.embedQuery(query),
355418
]);
356419

420+
docsWithContent.push(
421+
...filesData.map((fileData) => {
422+
return new Document({
423+
pageContent: fileData.content,
424+
metadata: {
425+
title: fileData.fileName,
426+
url: `File`,
427+
},
428+
});
429+
}),
430+
);
431+
432+
docEmbeddings.push(...filesData.map((fileData) => fileData.embeddings));
433+
357434
const similarity = docEmbeddings.map((docEmbedding, i) => {
358435
const sim = computeSimilarity(queryEmbedding, docEmbedding);
359436

@@ -408,6 +485,7 @@ const basicWebSearch = (
408485
llm: BaseChatModel,
409486
embeddings: Embeddings,
410487
optimizationMode: 'speed' | 'balanced' | 'quality',
488+
fileIds: string[],
411489
) => {
412490
const emitter = new eventEmitter();
413491

@@ -416,6 +494,7 @@ const basicWebSearch = (
416494
llm,
417495
embeddings,
418496
optimizationMode,
497+
fileIds,
419498
);
420499

421500
const stream = basicWebSearchAnsweringChain.streamEvents(
@@ -446,13 +525,15 @@ const handleWebSearch = (
446525
llm: BaseChatModel,
447526
embeddings: Embeddings,
448527
optimizationMode: 'speed' | 'balanced' | 'quality',
528+
fileIds: string[],
449529
) => {
450530
const emitter = basicWebSearch(
451531
message,
452532
history,
453533
llm,
454534
embeddings,
455535
optimizationMode,
536+
fileIds,
456537
);
457538
return emitter;
458539
};

src/db/schema.ts

Lines changed: 9 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1,3 +1,4 @@
1+
import { sql } from 'drizzle-orm';
12
import { text, integer, sqliteTable } from 'drizzle-orm/sqlite-core';
23

34
export const messages = sqliteTable('messages', {
@@ -11,9 +12,17 @@ export const messages = sqliteTable('messages', {
1112
}),
1213
});
1314

15+
interface File {
16+
name: string;
17+
fileId: string;
18+
}
19+
1420
export const chats = sqliteTable('chats', {
1521
id: text('id').primaryKey(),
1622
title: text('title').notNull(),
1723
createdAt: text('createdAt').notNull(),
1824
focusMode: text('focusMode').notNull(),
25+
files: text('files', { mode: 'json' })
26+
.$type<File[]>()
27+
.default(sql`'[]'`),
1928
});

src/lib/providers/ollama.ts

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -6,7 +6,7 @@ import { ChatOllama } from '@langchain/community/chat_models/ollama';
66
export const loadOllamaChatModels = async () => {
77
const ollamaEndpoint = getOllamaApiEndpoint();
88
const keepAlive = getKeepAlive();
9-
9+
1010
if (!ollamaEndpoint) return {};
1111

1212
try {
@@ -25,7 +25,7 @@ export const loadOllamaChatModels = async () => {
2525
baseUrl: ollamaEndpoint,
2626
model: model.model,
2727
temperature: 0.7,
28-
keepAlive: keepAlive
28+
keepAlive: keepAlive,
2929
}),
3030
};
3131

src/routes/index.ts

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -7,6 +7,7 @@ import suggestionsRouter from './suggestions';
77
import chatsRouter from './chats';
88
import searchRouter from './search';
99
import discoverRouter from './discover';
10+
import uploadsRouter from './uploads';
1011

1112
const router = express.Router();
1213

@@ -18,5 +19,6 @@ router.use('/suggestions', suggestionsRouter);
1819
router.use('/chats', chatsRouter);
1920
router.use('/search', searchRouter);
2021
router.use('/discover', discoverRouter);
22+
router.use('/uploads', uploadsRouter);
2123

2224
export default router;

0 commit comments

Comments
 (0)