Skip to content

Commit

Permalink
fix: able to read more pages of PDF (#247) (#248)
Browse files Browse the repository at this point in the history
  • Loading branch information
EdmelKun authored Dec 10, 2023
1 parent 45fb5d7 commit 83023b6
Show file tree
Hide file tree
Showing 4 changed files with 119 additions and 49 deletions.
15 changes: 4 additions & 11 deletions apps/expo/src/screens/create-reviewer/index.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ interface CacheOptions {

interface FilePickerType {
fileType: string;
base64ContentType: string;
}

export const CreateReviewerScreen = ({
Expand Down Expand Up @@ -222,10 +221,7 @@ export const CreateReviewerScreen = ({
return cacheFilePath;
};

const filePicker = async ({
fileType,
base64ContentType,
}: FilePickerType) => {
const filePicker = async ({ fileType }: FilePickerType) => {
const result = await DocumentPicker.getDocumentAsync({
type: fileType,
copyToCacheDirectory: false,
Expand All @@ -249,16 +245,15 @@ export const CreateReviewerScreen = ({
});

if (base64) {
const formatBase64 = `data:${base64ContentType};base64,${base64}`;
readFile(
{
file: formatBase64,
fileType: `${fileType === "image/*" ? "JPG" : "PDF"}`,
file: base64,
fileType: fileType,
},
{
onSuccess: (data) => {
if (richText.current) {
richText.current.insertHTML(data.text);
richText.current.insertHTML(data);
}
successToast({
title: "Success",
Expand Down Expand Up @@ -676,7 +671,6 @@ export const CreateReviewerScreen = ({
onPress={() =>
filePicker({
fileType: "application/pdf",
base64ContentType: "application/pdf",
})
}
/>
Expand All @@ -694,7 +688,6 @@ export const CreateReviewerScreen = ({
onPress={() =>
filePicker({
fileType: "image/*",
base64ContentType: "image/jpg",
})
}
/>
Expand Down
1 change: 1 addition & 0 deletions packages/api/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
"algoliasearch": "^4.20.0",
"dotenv": "^16.3.1",
"p-map": "^6.0.0",
"pdf-lib": "^1.17.1",
"pdf-parse": "^1.1.1",
"pdfkit": "^0.14.0",
"shutterstock-api": "^1.1.35",
Expand Down
124 changes: 86 additions & 38 deletions packages/api/src/router/pdfTextExtraction.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,78 @@
/* eslint-disable @typescript-eslint/no-explicit-any */
import { router, protectedProcedure } from "../trpc";
import { z } from "zod";
import { PDFDocument } from "pdf-lib";

type OCRResult = {
text: string;
// API key for the OCR service, read from the OCR_API environment variable.
// It is undefined when the variable is not set; createFormData only attaches
// it to the request when it is truthy.
const apiKey = process.env.OCR_API;
// URL that fetchOCR POSTs the OCR form data to.
const apiEndpoint = "https://api.ocr.space/parse/image";

/**
 * Splits a base64-encoded PDF into several smaller PDFs and returns each one
 * as a `data:application/pdf;base64,...` data URL, in page order.
 *
 * @param base64 - The source PDF as a bare base64 string (no data-URL prefix).
 * @param pagesPerChunk - Maximum number of pages per output PDF. Defaults to 3.
 *   NOTE(review): 3 presumably matches the OCR provider's per-request page
 *   limit — confirm before changing the default.
 * @returns One data URL per chunk; an empty array when the PDF has no pages.
 * @throws RangeError when `pagesPerChunk` is not a positive integer.
 */
const splitPDF = async (
  base64: string,
  pagesPerChunk = 3,
): Promise<string[]> => {
  // A non-positive step would make the chunking loop below never terminate.
  if (!Number.isInteger(pagesPerChunk) || pagesPerChunk < 1) {
    throw new RangeError("pagesPerChunk must be a positive integer");
  }

  const pdfBytes = Uint8Array.from(atob(base64), (c) => c.charCodeAt(0));
  const pdfDoc = await PDFDocument.load(pdfBytes);
  const pageCount = pdfDoc.getPageCount();
  const base64Array: string[] = [];

  for (let start = 0; start < pageCount; start += pagesPerChunk) {
    const end = Math.min(start + pagesPerChunk, pageCount);
    // copyPages works on page indices, so build the index range directly
    // instead of looking every page object up again in the source document.
    const chunkIndices = Array.from(
      { length: end - start },
      (_, offset) => start + offset,
    );

    const newPdfDoc = await PDFDocument.create();
    const copiedPages = await newPdfDoc.copyPages(pdfDoc, chunkIndices);
    for (const copiedPage of copiedPages) {
      newPdfDoc.addPage(copiedPage);
    }

    const newPdfBytes = await newPdfDoc.save();
    const newPdfBase64 = Buffer.from(newPdfBytes).toString("base64");
    base64Array.push(`data:application/pdf;base64,${newPdfBase64}`);
  }

  return base64Array;
};

/**
 * Builds the multipart form body for one OCR request.
 *
 * @param file - The document to recognise, sent as the `base64image` field.
 * @param fileType - Value sent as the `filetype` field.
 * @returns A FormData carrying the file, its type and the fixed OCR options;
 *   the `apikey` field is included only when the module-level key is set.
 */
const createFormData = (file: string, fileType: string) => {
  // Field names/values are the ones the OCR endpoint is sent; order is kept
  // stable so the request body is built the same way on every call.
  const fields: Array<[string, string]> = [
    ["base64image", file],
    ["filetype", fileType],
    ["scale", "true"],
    ["isTable", "true"],
    ["OCREngine", "2"],
  ];

  if (apiKey) {
    fields.push(["apikey", apiKey]);
  }

  const payload = new FormData();
  for (const [fieldName, fieldValue] of fields) {
    payload.append(fieldName, fieldValue);
  }
  return payload;
};

/**
 * POSTs the given form data to the OCR endpoint and returns the parsed JSON
 * response body.
 *
 * @param formData - Request body, as produced by createFormData.
 * @returns The decoded JSON payload of a successful response.
 * @throws Error "OCR request failed" when the HTTP status is not 2xx.
 * @throws Error "Data error on processing" when the payload reports
 *   `IsErroredOnProcessing`.
 */
const fetchOCR = async (formData: FormData) => {
  const response = await fetch(apiEndpoint, {
    method: "POST",
    body: formData,
  });

  // Check the HTTP status before decoding: an error response is not
  // guaranteed to carry JSON, and decoding it first would surface a JSON
  // syntax error instead of the intended failure.
  if (!response.ok) {
    throw new Error("OCR request failed");
  }

  const data = await response.json();
  // `?.` because a syntactically valid JSON body can still be `null`.
  if (data?.IsErroredOnProcessing) {
    throw new Error("Data error on processing");
  }

  return data;
};

/**
 * Extracts the recognised text from an OCR response payload.
 *
 * @param data - Parsed OCR response; its `ParsedResults` entries are read for
 *   their `ParsedText`.
 * @returns With more than one result, every text followed by a newline and
 *   concatenated in order; with exactly one result, that text as-is; an empty
 *   string when there are no results. A result without `ParsedText`
 *   contributes an empty string rather than the literal "undefined".
 */
const processOcrResult = (data: any): string => {
  // Tolerate a missing or malformed ParsedResults instead of throwing on
  // `.length` of undefined.
  const results: any[] = Array.isArray(data?.ParsedResults)
    ? data.ParsedResults
    : [];
  const texts = results.map((item) => `${item?.ParsedText ?? ""}`);

  if (texts.length > 1) {
    return texts.map((text) => text + "\n").join("");
  }

  return texts[0] ?? "";
};

export const textExtractionRouter = router({
Expand All @@ -16,45 +85,24 @@ export const textExtractionRouter = router({
)
.mutation(async ({ input }) => {
const { file, fileType } = input;
let ocrResult = "";

const apiKey = process.env.OCR_API;
const apiEndpoint = "https://api.ocr.space/parse/image";

const formData = new FormData();
formData.append("base64image", file);
formData.append("filetype", fileType);
formData.append("OCREngine", "2");
formData.append("detectOrientation", "true");
formData.append("scale", "true");
if (apiKey) {
formData.append("apikey", apiKey);
}

const response = await fetch(apiEndpoint, {
method: "POST",
body: formData,
});

if (!response.ok) {
throw new Error("OCR request failed");
}

const data = await response.json();
if (data.IsErroredOnProcessing) {
throw new Error("OCR request failed");
}
let combinedText = "";
if (data.ParsedResults.length > 1) {
data.ParsedResults.forEach((item: any) => {
combinedText += item.ParsedText + "\n";
});
if (fileType === "image/*") {
const imageFormat = `data:image/jpg;base64,${file}`;
const imageFormData = createFormData(imageFormat, "JPG");
const imageData = await fetchOCR(imageFormData);
ocrResult += " " + processOcrResult(imageData);
} else {
combinedText = data.ParsedResults?.[0]?.ParsedText;
}
const pdfDocuments = await splitPDF(file);

const ocrResult: OCRResult = {
text: combinedText,
};
for (let i = 0; i < pdfDocuments.length; i++) {
if (pdfDocuments[i]) {
const pdfFormData = createFormData(pdfDocuments[i] ?? "", fileType);
const pdfData = await fetchOCR(pdfFormData);
ocrResult += " " + processOcrResult(pdfData);
}
}
}

return ocrResult;
}),
Expand Down
28 changes: 28 additions & 0 deletions pnpm-lock.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit 83023b6

Please sign in to comment.