augcog · Catrunaround · Jul 6, 2024 · Jul 6, 2024 · Jul 6, 2024 · Jul 6, 2024
diff --git a/rag/.gitignore b/rag/.gitignore
@@ -6,7 +6,7 @@
 /scraper/Scrape_header/*/
 /scraper/Scrape_rst/*/
 /scraper/Scrape_md/*/
-/scraper/Scrape_pdf/*/
+# /scraper/Scrape_pdf/*/
 
 # Remove all videos and audio files
 *mp4

diff --git a/rag/file_conversion_router/conversion/pdf_converter.py b/rag/file_conversion_router/conversion/pdf_converter.py
@@ -1,12 +1,16 @@
 import subprocess
 from pathlib import Path
+import os
 
 from rag.file_conversion_router.conversion.base_converter import BaseConverter
 from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup
 from rag.file_conversion_router.classes.page import Page
 from rag.file_conversion_router.classes.chunk import Chunk
 import yaml
 
+from rag.scraper.Scrape_pdf.pdf_helper import generate_mmd_file_path
+from rag.scraper.Scrape_pdf.Scrape_pdf import process_pdf, pdf_to_md
+
 class PdfConverter(BaseConverter):
     def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4):
         super().__init__()
@@ -32,26 +36,34 @@ def _validate_parameters(self):
 
     # Override
     def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
+        input_pdf = str(input_path)
+        folder_name =  str(output_path.parent)
+        output_pdf = process_pdf(input_pdf)
+
         # """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
-        # command = [
-        #     "nougat",
-        #     str(input_path),
-        #     # nougat requires the argument output path to be a directory, not file, so we need to handle it here
-        #     "-o",
-        #     str(output_path.parent),
-        #     "--no-skipping",
-        #     "--model",
-        #     self.model_tag,
-        #     "--batchsize",
-        #     str(self.batch_size),
-        # ]
+        command = [
+            "nougat",
+            str(input_path),
+            # nougat requires the argument output path to be a directory, not file, so we need to handle it here
+            "-o",
+            str(output_path.parent),
+            "--no-skipping",
+            "--model",
+            self.model_tag,
+            "--batchsize",
+            str(self.batch_size),
+        ]
+
+        mmd_file_path = generate_mmd_file_path(folder_name)
+        pdf_to_md(output_pdf, folder_name, mmd_file_path)
         # try:
         #     result = subprocess.run(command, check=False, capture_output=True, text=True)
         #     self._logger.info(f"Output: {result.stdout}")
         #     self._logger.info(f"Errors: {result.stderr}")
         #     if result.returncode != 0:
         #         self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
         #     # Now change the file name of generated mmd file to align with the expected md file path from base converter
+        output_path = Path(mmd_file_path)
         output_mmd_path = output_path.with_suffix(".mmd")
         # Rename it to `md` file
         target = output_path.with_suffix(".md")
@@ -79,4 +91,5 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page:
             metadata_content = yaml.safe_load(metadata_file)
         url = metadata_content.get("URL", None)
         return Page(content={'text': text}, filetype=filetype, page_url=url)
-
+
+
diff --git a/rag/requirements.txt b/rag/requirements.txt
@@ -23,4 +23,7 @@ tokenizers==0.15.2
 torch==2.3.0
 tqdm==4.66.2
 transformers==4.38.2
+pymupdf==1.24.5
+pymupdfb==1.24.3
 voyageai==0.2.2
+pix2text==1.1.1
diff --git a/rag/scraper/Scrape_pdf/README.md b/rag/scraper/Scrape_pdf/README.md
@@ -1,13 +1,12 @@
 # Scrape_pdf  
 First we will need to convert the pdf into a markdown format. We will use a tool called nougat.
 - run `pip install nougat-ocr` to install nougat
-- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
-  ```
-  pdf_to_md('~/Downloads/MLS.pdf', 'textbook')
+- Go to `Scrape_pdf.py` and set `input_pdf` to the PDF you want to convert and `folder_name` to the folder you want to save your documents in, then run it:
+  ```
+  python3 Scrape_pdf.py
   ```
 - After you get your markdown folder now run `header.py` to segment the contents of the markdown file into headers and contents.
   ```
   # TODO
-  parser = MarkdownParser('textbook/MLS.mmd')
   ```
-- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
+- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
diff --git a/rag/scraper/Scrape_pdf/Scrape_pdf.py b/rag/scraper/Scrape_pdf/Scrape_pdf.py
@@ -0,0 +1,59 @@
+from nougat import pdf_to_md
+import fitz
+from pathlib import Path
+from pdf_helper import missing_page_fill
+import os
+
+
+def add_remove_image_suffix(pdf_path: str) -> str:
+    """Return `pdf_path` with '_remove_image' inserted before its .pdf extension.
+
+    Raises ValueError when the suffix is not '.pdf' (compared case-insensitively).
+    """
+    source = Path(pdf_path)
+    if source.suffix.lower() != '.pdf':
+        raise ValueError("The input file must be a PDF.")
+    # Rebuild only the file name so the directory and extension stay untouched.
+    return str(source.with_name(f"{source.stem}_remove_image{source.suffix}"))
+
+def remove_images(input_pdf, output_pdf):
+    """Save a copy of `input_pdf` at `output_pdf` with every page image deleted.
+
+    input_pdf: path of the PDF to open.
+    output_pdf: path the modified document is saved to.
+
+    The document is closed on exit, including when a page fails to process.
+    """
+    with fitz.open(input_pdf) as doc:
+        for page in doc:
+            # Each get_images(full=True) entry starts with the image's xref.
+            for image in page.get_images(full=True):
+                page.delete_image(image[0])
+
+            # Optionally clean up empty spaces
+            page.clean_contents()
+
+        # Save the modified PDF
+        doc.save(output_pdf)
+
+def process_pdf(input_pdf: str) -> str:
+    """Strip the images from `input_pdf` into a '_remove_image'-suffixed copy and return that copy's path."""
+    stripped_pdf = add_remove_image_suffix(input_pdf)
+    remove_images(input_pdf, stripped_pdf)
+    return stripped_pdf
+
+def generate_mmd_file_path(folder_path):
+    """Return `<folder_path>/<folder name>_remove_image.mmd`; trailing path separators are ignored."""
+    trimmed = os.fspath(folder_path).rstrip(os.sep + (os.altsep or ""))
+    return os.path.join(folder_path, f"{os.path.basename(trimmed)}_remove_image.mmd")
+
+
+input_pdf = "rag/scraper/Scrape_pdf/example_pdf.pdf"  # forward slashes: a bare "\s" or "\e" in a normal string is an invalid escape
+folder_name = "rag/scraper/Scrape_pdf/example_pdf"  # folder handed to nougat as its -o output directory
+
+
+if __name__ == '__main__':
+    output_pdf = process_pdf(input_pdf)  # image-free copy saved next to the input with a '_remove_image' suffix
+    pdf_to_md(output_pdf, folder_name)  # run nougat on that copy, writing into folder_name
+    md_file_path = generate_mmd_file_path(folder_name)  # <folder_name>/<folder basename>_remove_image.mmd
+    missing_page_fill(output_pdf, folder_name, md_file_path)  # replace [MISSING_PAGE...] markers in that file
diff --git a/rag/scraper/Scrape_pdf/__init__.py b/rag/scraper/Scrape_pdf/__init__.py
diff --git a/rag/scraper/Scrape_pdf/example_pdf.pdf b/rag/scraper/Scrape_pdf/example_pdf.pdf
diff --git a/rag/scraper/Scrape_pdf/nougat.py b/rag/scraper/Scrape_pdf/nougat.py
@@ -1,7 +1,7 @@
 import os
 def pdf_to_md(pdf_file_path, folder_name):
     # Command to execute
-    command = f"nougat {pdf_file_path} -o {folder_name}"
+    command = f'nougat "{pdf_file_path}" -o "{folder_name}" -m 0.1.0-base --no-skipping'  # quoted so a path containing spaces stays one argument
     # Run the command
     os.system(command)
 

diff --git a/rag/scraper/Scrape_pdf/pdf_helper.py b/rag/scraper/Scrape_pdf/pdf_helper.py
@@ -0,0 +1,94 @@
+import os
+import re
+import fitz  
+from pix2text import Pix2Text
+
+
+def convert_pdf_to_markdown(pdf_file_path, output_file_path, page_numbers=None):
+    """
+    Run Pix2Text over a PDF and write the recognized content out as Markdown.
+
+    Failures are reported on stdout rather than raised, so the caller always regains control.
+
+    Parameters:
+    pdf_file_path (str): The file path of the input PDF.
+    output_file_path (str): Destination handed unchanged to ``to_markdown``.
+    page_numbers (list of int, optional): Pages to process. Defaults to None, which is
+        forwarded to ``recognize_pdf`` as-is.
+    """
+    try:
+        # Default-configured recognizer, built on every call.
+        recognizer = Pix2Text.from_config()
+        recognized = recognizer.recognize_pdf(pdf_file_path, page_numbers=page_numbers)
+
+        # Persist the recognition result as Markdown.
+        recognized.to_markdown(output_file_path)
+        print(f"Markdown saved to {output_file_path}")
+    except Exception as e:
+        print(f"An error occurred: {e}")
+
+def extract_and_convert_pdf_to_md(pdf_path, md_path, output_folder):
+    """Fill every [MISSING_PAGE...:N] marker in a Markdown file with re-converted page text.
+
+    For each page number N captured from a marker, page N of `pdf_path` is saved as
+    `output_folder/page_N.pdf`, passed to `convert_pdf_to_markdown` with
+    `output_folder/page_N_output` as destination, and the first Markdown file found
+    there replaces the marker. `md_path` is rewritten in place; if it does not exist
+    a message is printed and nothing else happens.
+    """
+    # Check the Markdown file first so the PDF is not opened (and left open) for nothing.
+    if not os.path.exists(md_path):
+        print(f"Markdown file does not exist: {md_path}")
+        return
+
+    with open(md_path, 'r', encoding='utf-8') as md_file:
+        markdown_content = md_file.read()
+
+    # Match all forms of MISSING_PAGE markers; dict.fromkeys drops repeated page numbers.
+    missing_pages = dict.fromkeys(re.findall(r'\[MISSING_PAGE.*?:(\d+)\]', markdown_content))
+
+    with fitz.open(pdf_path) as pdf_document:
+        for page_number in missing_pages:
+            page_index = int(page_number) - 1  # marker numbers are treated as 1-based
+            # Copy the page into its own single-page PDF; the scratch document is closed on exit.
+            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
+            with fitz.open() as single_page_document:
+                single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
+                single_page_document.save(single_page_pdf_path)
+
+            # Convert the single-page PDF (Pix2Text, via convert_pdf_to_markdown).
+            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
+            os.makedirs(single_page_output_folder, exist_ok=True)
+            convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)
+
+            # Keep only Markdown files (the folder may hold other artifacts) and sort them,
+            # because os.listdir returns entries in arbitrary order.
+            md_names = sorted(n for n in os.listdir(single_page_output_folder) if n.endswith(('.md', '.mmd')))
+            if not md_names:
+                print(f"No Markdown file generated for page {page_number}")
+                continue
+
+            single_page_md_path = os.path.join(single_page_output_folder, md_names[0])
+            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
+                single_page_md_content = single_page_md_file.read()
+
+            # A callable replacement is inserted literally, so backslashes need no escaping.
+            marker = rf'\[MISSING_PAGE.*?:{page_number}\]'
+            markdown_content = re.sub(marker, lambda _match: single_page_md_content, markdown_content)
+
+    # Save the updated Markdown content
+    with open(md_path, 'w', encoding='utf-8') as md_file:
+        md_file.write(markdown_content)
+
+def missing_page_fill(pdf_file_path, folder_name, md_file_path):
+    """Create `folder_name` if needed, then fill the missing-page markers of `md_file_path`.
+
+    The work is delegated to `extract_and_convert_pdf_to_md`, which uses `folder_name` for its output.
+    """
+    os.makedirs(folder_name, exist_ok=True)  # no exists() pre-check, so no race with a concurrent creator
+    extract_and_convert_pdf_to_md(pdf_file_path, md_file_path, folder_name)
+
+def generate_mmd_file_path(folder_path):
+    """Return `<folder_path>/<folder name>_remove_image.mmd`; trailing path separators are ignored."""
+    trimmed = os.fspath(folder_path).rstrip(os.sep + (os.altsep or ""))
+    return os.path.join(folder_path, f"{os.path.basename(trimmed)}_remove_image.mmd")