From 83023b63bac3af72622e7e87ffcddadfba8aa05f Mon Sep 17 00:00:00 2001 From: Edmel John Linaugo <68092712+EdmelKun@users.noreply.github.com> Date: Sun, 10 Dec 2023 15:19:08 +0800 Subject: [PATCH] fix: able to read more pages of PDF (#247) (#248) --- .../src/screens/create-reviewer/index.tsx | 15 +-- packages/api/package.json | 1 + packages/api/src/router/pdfTextExtraction.ts | 124 ++++++++++++------ pnpm-lock.yaml | 28 ++++ 4 files changed, 119 insertions(+), 49 deletions(-) diff --git a/apps/expo/src/screens/create-reviewer/index.tsx b/apps/expo/src/screens/create-reviewer/index.tsx index bd515a8..d9781ae 100644 --- a/apps/expo/src/screens/create-reviewer/index.tsx +++ b/apps/expo/src/screens/create-reviewer/index.tsx @@ -53,7 +53,6 @@ interface CacheOptions { interface FilePickerType { fileType: string; - base64ContentType: string; } export const CreateReviewerScreen = ({ @@ -222,10 +221,7 @@ export const CreateReviewerScreen = ({ return cacheFilePath; }; - const filePicker = async ({ - fileType, - base64ContentType, - }: FilePickerType) => { + const filePicker = async ({ fileType }: FilePickerType) => { const result = await DocumentPicker.getDocumentAsync({ type: fileType, copyToCacheDirectory: false, @@ -249,16 +245,15 @@ export const CreateReviewerScreen = ({ }); if (base64) { - const formatBase64 = `data:${base64ContentType};base64,${base64}`; readFile( { - file: formatBase64, - fileType: `${fileType === "image/*" ? "JPG" : "PDF"}`, + file: base64, + fileType: fileType, }, { onSuccess: (data) => { if (richText.current) { - richText.current.insertHTML(data.text); + richText.current.insertHTML(data); } successToast({ title: "Success", @@ -676,7 +671,6 @@ export const CreateReviewerScreen = ({ onPress={() => filePicker({ fileType: "application/pdf", - base64ContentType: "application/pdf", }) } /> @@ -694,7 +688,6 @@ export const CreateReviewerScreen = ({ onPress={() => filePicker({ fileType: "image/*", - base64ContentType: "image/jpg", }) } /> diff --git a/packages/api/package.json b/packages/api/package.json index ff2f873..e0437a9 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -21,6 +21,7 @@ "algoliasearch": "^4.20.0", "dotenv": "^16.3.1", "p-map": "^6.0.0", + "pdf-lib": "^1.17.1", "pdf-parse": "^1.1.1", "pdfkit": "^0.14.0", "shutterstock-api": "^1.1.35", diff --git a/packages/api/src/router/pdfTextExtraction.ts b/packages/api/src/router/pdfTextExtraction.ts index 4bf6de2..4766dd0 100644 --- a/packages/api/src/router/pdfTextExtraction.ts +++ b/packages/api/src/router/pdfTextExtraction.ts @@ -1,9 +1,78 @@ /* eslint-disable @typescript-eslint/no-explicit-any */ import { router, protectedProcedure } from "../trpc"; import { z } from "zod"; +import { PDFDocument } from "pdf-lib"; -type OCRResult = { - text: string; +const apiKey = process.env.OCR_API; +const apiEndpoint = "https://api.ocr.space/parse/image"; + +const splitPDF = async (base64: string) => { + const pdfBytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0)); + const pdfDoc = await PDFDocument.load(pdfBytes); + const pages = pdfDoc.getPages(); + const chunks = []; + + for (let i = 0; i < pages.length; i += 3) { + chunks.push(pages.slice(i, i + 3)); + } + + const base64Array = []; + for (const chunk of chunks) { + const newPdfDoc = await PDFDocument.create(); + const chunkIndices = chunk.map((page) => pdfDoc.getPages().indexOf(page)); + const copiedPages = await newPdfDoc.copyPages(pdfDoc, chunkIndices); + for (const copiedPage of copiedPages) { + newPdfDoc.addPage(copiedPage); + } + const newPdfBytes = await newPdfDoc.save(); + const newPdfBase64 = Buffer.from(newPdfBytes).toString("base64"); + base64Array.push(`data:application/pdf;base64,${newPdfBase64}`); + } + + return base64Array; +}; + +const createFormData = (file: string, fileType: string) => { + const formData = new FormData(); + formData.append("base64image", file); + formData.append("filetype", fileType); + formData.append("scale", "true"); + formData.append("isTable", "true"); + formData.append("OCREngine", "2"); + if (apiKey) { + formData.append("apikey", apiKey); + } + return formData; +}; + +const fetchOCR = async (formData: FormData) => { + const response = await fetch(apiEndpoint, { + method: "POST", + body: formData, + }); + + const data = await response.json(); + if (data.IsErroredOnProcessing) { + throw new Error("Data error on processing"); + } + + if (!response.ok) { + throw new Error("OCR request failed"); + } + return data; +}; + +const processOcrResult = (data: any) => { + let combinedText = ""; + if (data.ParsedResults.length > 1) { + data.ParsedResults.forEach((item: any) => { + combinedText += item.ParsedText + "\n"; + }); + } else { + combinedText = data.ParsedResults?.[0]?.ParsedText; + } + + return combinedText; }; export const textExtractionRouter = router({ @@ -16,45 +85,24 @@ export const textExtractionRouter = router({ ) .mutation(async ({ input }) => { const { file, fileType } = input; + let ocrResult = ""; - const apiKey = process.env.OCR_API; - const apiEndpoint = "https://api.ocr.space/parse/image"; - - const formData = new FormData(); - formData.append("base64image", file); - formData.append("filetype", fileType); - formData.append("OCREngine", "2"); - formData.append("detectOrientation", "true"); - formData.append("scale", "true"); - if (apiKey) { - formData.append("apikey", apiKey); - } - - const response = await fetch(apiEndpoint, { - method: "POST", - body: formData, - }); - - if (!response.ok) { - throw new Error("OCR request failed"); - } - - const data = await response.json(); - if (data.IsErroredOnProcessing) { - throw new Error("OCR request failed"); - } - let combinedText = ""; - if (data.ParsedResults.length > 1) { - data.ParsedResults.forEach((item: any) => { - combinedText += item.ParsedText + "\n"; - }); + if (fileType === "image/*") { + const imageFormat = `data:image/jpg;base64,${file}`; + const imageFormData = createFormData(imageFormat, "JPG"); + const imageData = await fetchOCR(imageFormData); + ocrResult += " " + processOcrResult(imageData); } else { - combinedText = data.ParsedResults?.[0]?.ParsedText; - } + const pdfDocuments = await splitPDF(file); - const ocrResult: OCRResult = { - text: combinedText, - }; + for (let i = 0; i < pdfDocuments.length; i++) { + if (pdfDocuments[i]) { + const pdfFormData = createFormData(pdfDocuments[i] ?? "", fileType); + const pdfData = await fetchOCR(pdfFormData); + ocrResult += " " + processOcrResult(pdfData); + } + } + } return ocrResult; }), diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1f0a65a..353720d 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -427,6 +427,9 @@ importers: p-map: specifier: ^6.0.0 version: 6.0.0 + pdf-lib: + specifier: ^1.17.1 + version: 1.17.1 pdf-parse: specifier: ^1.1.1 version: 1.1.1 @@ -3451,6 +3454,18 @@ packages: rimraf: 3.0.2 dev: false + /@pdf-lib/standard-fonts@1.0.0: + resolution: {integrity: sha512-hU30BK9IUN/su0Mn9VdlVKsWBS6GyhVfqjwl1FjZN4TxP6cCw0jP2w7V3Hf5uX7M0AZJ16vey9yE0ny7Sa59ZA==} + dependencies: + pako: 1.0.11 + dev: false + + /@pdf-lib/upng@1.0.1: + resolution: {integrity: sha512-dQK2FUMQtowVP00mtIksrlZhdFXQZPC+taih1q4CvPZ5vqdxR/LKBaFg0oAfzd1GlHZXXSPdQfzQnt+ViGvEIQ==} + dependencies: + pako: 1.0.11 + dev: false + /@peculiar/asn1-schema@2.3.8: resolution: {integrity: sha512-ULB1XqHKx1WBU/tTFIA+uARuRoBVZ4pNdOA878RDrRbBfBGcSzi5HBkdScC6ZbHn8z7L8gmKCgPC1LHRrP46tA==} dependencies: @@ -10424,6 +10439,10 @@ packages: resolution: {integrity: sha512-NUcwaKxUxWrZLpDG+z/xZaCgQITkA/Dv4V/T6bw7VON6l1Xz/VnrBqrYjZQ12TamKHzITTfOEIYUj48y2KXImA==} dev: false + /pako@1.0.11: + resolution: {integrity: sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==} + dev: false + /parent-module@1.0.1: resolution: {integrity: sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==} engines: {node: '>=6'} @@ -10528,6 +10547,15 @@ packages: resolution: {integrity: sha512-Dp6zGqpTdETdR63lehJYPeIOqpiNBNtc7BpWSLrOje7UaIsE5aY92r/AunQA7rsXvet3lrJ3JnZX29UPTKXyKQ==} dev: false + /pdf-lib@1.17.1: + resolution: {integrity: sha512-V/mpyJAoTsN4cnP31vc0wfNA1+p20evqqnap0KLoRUN0Yk/p3wN52DOEsL4oBFcLdb76hlpKPtzJIgo67j/XLw==} + dependencies: + '@pdf-lib/standard-fonts': 1.0.0 + '@pdf-lib/upng': 1.0.1 + pako: 1.0.11 + tslib: 1.14.1 + dev: false + /pdf-parse@1.1.1: resolution: {integrity: sha512-v6ZJ/efsBpGrGGknjtq9J/oC8tZWq0KWL5vQrk2GlzLEQPUDB1ex+13Rmidl1neNN358Jn9EHZw5y07FFtaC7A==} engines: {node: '>=6.8.1'}