fix: able to read more pages of PDF (#247) (#248)

HansGabriel · Dec 10, 2023 · 83023b6 · 83023b6
1 parent 45fb5d7
commit 83023b6
Show file tree

Hide file tree

Showing 4 changed files with 119 additions and 49 deletions.
diff --git a/apps/expo/src/screens/create-reviewer/index.tsx b/apps/expo/src/screens/create-reviewer/index.tsx
@@ -53,7 +53,6 @@ interface CacheOptions {
 
 interface FilePickerType {
   fileType: string;
-  base64ContentType: string;
 }
 
 export const CreateReviewerScreen = ({
@@ -222,10 +221,7 @@ export const CreateReviewerScreen = ({
     return cacheFilePath;
   };
 
-  const filePicker = async ({
-    fileType,
-    base64ContentType,
-  }: FilePickerType) => {
+  const filePicker = async ({ fileType }: FilePickerType) => {
     const result = await DocumentPicker.getDocumentAsync({
       type: fileType,
       copyToCacheDirectory: false,
@@ -249,16 +245,15 @@ export const CreateReviewerScreen = ({
       });
 
       if (base64) {
-        const formatBase64 = `data:${base64ContentType};base64,${base64}`;
         readFile(
           {
-            file: formatBase64,
-            fileType: `${fileType === "image/*" ? "JPG" : "PDF"}`,
+            file: base64,
+            fileType: fileType,
           },
           {
             onSuccess: (data) => {
               if (richText.current) {
-                richText.current.insertHTML(data.text);
+                richText.current.insertHTML(data);
               }
               successToast({
                 title: "Success",
@@ -676,7 +671,6 @@ export const CreateReviewerScreen = ({
             onPress={() =>
               filePicker({
                 fileType: "application/pdf",
-                base64ContentType: "application/pdf",
               })
             }
           />
@@ -694,7 +688,6 @@ export const CreateReviewerScreen = ({
             onPress={() =>
               filePicker({
                 fileType: "image/*",
-                base64ContentType: "image/jpg",
               })
             }
           />

diff --git a/packages/api/package.json b/packages/api/package.json
@@ -21,6 +21,7 @@
     "algoliasearch": "^4.20.0",
     "dotenv": "^16.3.1",
     "p-map": "^6.0.0",
+    "pdf-lib": "^1.17.1",
     "pdf-parse": "^1.1.1",
     "pdfkit": "^0.14.0",
     "shutterstock-api": "^1.1.35",

diff --git a/packages/api/src/router/pdfTextExtraction.ts b/packages/api/src/router/pdfTextExtraction.ts
@@ -1,9 +1,78 @@
 /* eslint-disable @typescript-eslint/no-explicit-any */
 import { router, protectedProcedure } from "../trpc";
 import { z } from "zod";
+import { PDFDocument } from "pdf-lib";
 
-type OCRResult = {
-  text: string;
+const apiKey = process.env.OCR_API;
+const apiEndpoint = "https://api.ocr.space/parse/image";
+
+/**
+ * Splits a base64-encoded PDF into smaller PDFs of at most `pagesPerChunk`
+ * pages each, so that every piece can be submitted to OCR on its own.
+ *
+ * @param base64 - Raw base64 PDF contents (no `data:` URI prefix).
+ * @param pagesPerChunk - Maximum number of pages per output document.
+ *   Defaults to 3; presumably this mirrors the OCR provider's per-request
+ *   page limit (TODO confirm against the plan in use).
+ * @returns One `data:application/pdf;base64,...` URI per chunk, in page order.
+ * @throws RangeError when `pagesPerChunk` is not a positive integer.
+ */
+const splitPDF = async (base64: string, pagesPerChunk = 3) => {
+  // A step of 0 (or a negative/fractional step) would never terminate the loop.
+  if (!Number.isInteger(pagesPerChunk) || pagesPerChunk < 1) {
+    throw new RangeError("pagesPerChunk must be a positive integer");
+  }
+
+  const pdfBytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0));
+  const pdfDoc = await PDFDocument.load(pdfBytes);
+  const pageCount = pdfDoc.getPages().length;
+  const base64Array: string[] = [];
+
+  for (let start = 0; start < pageCount; start += pagesPerChunk) {
+    const end = Math.min(start + pagesPerChunk, pageCount);
+    // Chunks are contiguous page ranges, so the indices can be generated
+    // directly instead of searching the page list for every page object.
+    const chunkIndices = Array.from(
+      { length: end - start },
+      (_, offset) => start + offset,
+    );
+
+    const newPdfDoc = await PDFDocument.create();
+    const copiedPages = await newPdfDoc.copyPages(pdfDoc, chunkIndices);
+    for (const copiedPage of copiedPages) {
+      newPdfDoc.addPage(copiedPage);
+    }
+
+    const newPdfBytes = await newPdfDoc.save();
+    const newPdfBase64 = Buffer.from(newPdfBytes).toString("base64");
+    base64Array.push(`data:application/pdf;base64,${newPdfBase64}`);
+  }
+
+  return base64Array;
+};
+
+/**
+ * Assembles the multipart form body for a single OCR request.
+ *
+ * @param file - The document payload sent as the `base64image` field.
+ * @param fileType - Value sent as the `filetype` field.
+ * @returns A `FormData` with the payload, the fixed OCR options, and the
+ *   `apikey` field when the module-level `apiKey` is set.
+ */
+const createFormData = (file: string, fileType: string) => {
+  // Field order is kept exactly as the request has always been built.
+  const fields: [string, string][] = [
+    ["base64image", file],
+    ["filetype", fileType],
+    ["scale", "true"],
+    ["isTable", "true"],
+    ["OCREngine", "2"],
+  ];
+  if (apiKey) {
+    fields.push(["apikey", apiKey]);
+  }
+
+  const body = new FormData();
+  for (const [name, value] of fields) {
+    body.append(name, value);
+  }
+  return body;
+};
+
+/**
+ * POSTs a prepared form body to the OCR endpoint and returns the parsed
+ * JSON response.
+ *
+ * @param formData - Request body, as produced by `createFormData`.
+ * @returns The decoded JSON payload of a successful OCR response.
+ * @throws Error "OCR request failed" when the HTTP status is not 2xx.
+ * @throws Error "Data error on processing" when the service reports
+ *   `IsErroredOnProcessing` in an otherwise successful response.
+ */
+const fetchOCR = async (formData: FormData) => {
+  const response = await fetch(apiEndpoint, {
+    method: "POST",
+    body: formData,
+  });
+
+  // Check the HTTP status before decoding: an error response is not
+  // guaranteed to carry JSON, and parsing it first would replace the real
+  // failure with an unrelated JSON syntax error.
+  if (!response.ok) {
+    throw new Error("OCR request failed");
+  }
+
+  const data = await response.json();
+  if (data.IsErroredOnProcessing) {
+    throw new Error("Data error on processing");
+  }
+  return data;
+};
+
+/**
+ * Extracts the recognised text from an OCR response.
+ *
+ * With more than one entry in `ParsedResults`, each entry's `ParsedText` is
+ * followed by a newline and the pieces are concatenated. With exactly one
+ * entry, its `ParsedText` is returned unchanged. A response without usable
+ * results yields an empty string, so callers can concatenate the return
+ * value without ever appending the literal text "undefined".
+ *
+ * @param data - Decoded OCR response; expected to carry a `ParsedResults`
+ *   array whose items expose `ParsedText`.
+ * @returns The combined text, or "" when there is nothing to extract.
+ */
+const processOcrResult = (data: any) => {
+  // Tolerate a missing or malformed `ParsedResults` instead of throwing.
+  const parsedResults: any[] = Array.isArray(data?.ParsedResults)
+    ? data.ParsedResults
+    : [];
+
+  if (parsedResults.length > 1) {
+    return parsedResults
+      .map((item: any) => `${item?.ParsedText ?? ""}\n`)
+      .join("");
+  }
+
+  return parsedResults[0]?.ParsedText ?? "";
 };
 
 export const textExtractionRouter = router({
@@ -16,45 +85,24 @@ export const textExtractionRouter = router({
     )
     .mutation(async ({ input }) => {
       const { file, fileType } = input;
+      let ocrResult = "";
 
-      const apiKey = process.env.OCR_API;
-      const apiEndpoint = "https://api.ocr.space/parse/image";
-
-      const formData = new FormData();
-      formData.append("base64image", file);
-      formData.append("filetype", fileType);
-      formData.append("OCREngine", "2");
-      formData.append("detectOrientation", "true");
-      formData.append("scale", "true");
-      if (apiKey) {
-        formData.append("apikey", apiKey);
-      }
-
-      const response = await fetch(apiEndpoint, {
-        method: "POST",
-        body: formData,
-      });
-
-      if (!response.ok) {
-        throw new Error("OCR request failed");
-      }
-
-      const data = await response.json();
-      if (data.IsErroredOnProcessing) {
-        throw new Error("OCR request failed");
-      }
-      let combinedText = "";
-      if (data.ParsedResults.length > 1) {
-        data.ParsedResults.forEach((item: any) => {
-          combinedText += item.ParsedText + "\n";
-        });
+      if (fileType === "image/*") {
+        const imageFormat = `data:image/jpg;base64,${file}`;
+        const imageFormData = createFormData(imageFormat, "JPG");
+        const imageData = await fetchOCR(imageFormData);
+        ocrResult += " " + processOcrResult(imageData);
       } else {
-        combinedText = data.ParsedResults?.[0]?.ParsedText;
-      }
+        const pdfDocuments = await splitPDF(file);
 
-      const ocrResult: OCRResult = {
-        text: combinedText,
-      };
+        for (let i = 0; i < pdfDocuments.length; i++) {
+          if (pdfDocuments[i]) {
+            const pdfFormData = createFormData(pdfDocuments[i] ?? "", fileType);
+            const pdfData = await fetchOCR(pdfFormData);
+            ocrResult += " " + processOcrResult(pdfData);
+          }
+        }
+      }
 
       return ocrResult;
     }),

diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml