Yikang/new conversion (#56)

* committing scraper_pdf folder. remove ignore for now * Yikang's scrape pdf * edited router * edited nougat * readme * added requirements * added pix2text * fix api question * try new function * fix compare question * update pdf_scrape readme * fixed failed and passed test * update readme and passed api test * modified api * ignore pdf file while comparing * ignore extra pdf while comparing * remove api main func * move funcs in pdf converter class
augcog · Jul 16, 2024 · 6a2875e · 6a2875e
1 parent ea2b52e
commit 6a2875e
Show file tree

Hide file tree

Showing 8 changed files with 141 additions and 15 deletions.
diff --git a/rag/.gitignore b/rag/.gitignore
@@ -6,7 +6,7 @@
 /scraper/Scrape_header/*/
 /scraper/Scrape_rst/*/
 /scraper/Scrape_md/*/
-/scraper/Scrape_pdf/*/
+# /scraper/Scrape_pdf/*/
 
 # Remove all videos and audio files
 *mp4

diff --git a/rag/file_conversion_router/api.py b/rag/file_conversion_router/api.py
@@ -15,3 +15,4 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path])
     2. Markdown (To clarify, this markdown includes additional tree structure of original markdown file)
     """
     process_folder(input_dir, output_dir)
+
diff --git a/rag/file_conversion_router/conversion/pdf_converter.py b/rag/file_conversion_router/conversion/pdf_converter.py
@@ -1,12 +1,18 @@
 import subprocess
 from pathlib import Path
+import os
+import fitz
+import re
+from pix2text import Pix2Text
+
 
 from rag.file_conversion_router.conversion.base_converter import BaseConverter
 from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup
 from rag.file_conversion_router.classes.page import Page
 from rag.file_conversion_router.classes.chunk import Chunk
 import yaml
 
+
 class PdfConverter(BaseConverter):
     def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4):
         super().__init__()
@@ -29,21 +35,126 @@ def _validate_parameters(self):
         acceptable_models = ["0.1.0-small", "0.1.0-base"]
         if self.model_tag not in acceptable_models:
             raise ValueError(f"Model tag must be one of {acceptable_models}")
+
+    def convert_pdf_to_markdown(self, pdf_file_path, output_file_path, page_numbers=None):
+        # Convert a PDF to Markdown with Pix2Text and save the result.
+        # Any exception is caught below and only printed; nothing is raised or returned.
+
+        # Parameters:
+        #   pdf_file_path: path of the input PDF, passed to Pix2Text's recognize_pdf.
+        #   output_file_path: destination handed to doc.to_markdown. NOTE(review): the caller
+        #     in this class passes a directory here -- confirm what Pix2Text expects.
+        #   page_numbers: forwarded to recognize_pdf; None presumably means every page.
+        try:
+            # Build a Pix2Text instance from its default configuration
+            p2t = Pix2Text.from_config()
+
+            # Recognize the content of the requested pages
+            doc = p2t.recognize_pdf(pdf_file_path, page_numbers=page_numbers)
+
+            # Write the recognized document out as Markdown
+            doc.to_markdown(output_file_path)
+
+            print(f"Markdown saved to {output_file_path}")
+        except Exception as e:
+            print(f"An error occurred: {e}")
+
+    def remove_images_from_pdf(self, input_path: Path, output_path: Path):
+        pdf_document = fitz.open(input_path)
+
+        for page_num in range(len(pdf_document)):
+            page = pdf_document.load_page(page_num)
+            images = page.get_images(full=True)
+
+            # Delete every image on this page by xref, walking the list from last to first
+            for img_index in range(len(images) - 1, -1, -1):
+                xref = images[img_index][0]
+                page.delete_image(xref)
+
+            # Clean the page's contents after the deletions (presumably tidies leftovers -- confirm)
+            page.clean_contents()
+        pdf_document.save(output_path)  # write the image-free copy to output_path
+        pdf_document.close()
+
+    def extract_and_convert_pdf_to_md(self, pdf_path, md_path, output_folder):
+        # Open the source PDF. NOTE(review): it is not closed on the early return below
+        pdf_document = fitz.open(pdf_path)
+
+        # Nothing to patch when the Markdown file is absent
+        if not os.path.exists(md_path):
+            print(f"Markdown file does not exist: {md_path}")
+            return
+
+        # Read the existing Markdown content
+        with open(md_path, 'r', encoding='utf-8') as md_file:
+            markdown_content = md_file.read()
+
+        # Collect the page number captured from every [MISSING_PAGE...:N] marker
+        missing_pages = re.findall(r'\[MISSING_PAGE.*?:(\d+)\]', markdown_content)
+
+        # Re-convert each missing page from its own single-page PDF (markers are treated as 1-based)
+        for page_number in missing_pages:
+            page_index = int(page_number) - 1
+            page = pdf_document.load_page(page_index)
+            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
+            single_page_document = fitz.open()
+            single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
+            single_page_document.save(single_page_pdf_path)
+
+            # Convert the single-page PDF via convert_pdf_to_markdown (Pix2Text, not Nougat)
+            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
+            if not os.path.exists(single_page_output_folder):
+                os.makedirs(single_page_output_folder)
+            self.convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)
+
+            # Use the first entry os.listdir returns for this page's output folder
+            single_page_md_files = os.listdir(single_page_output_folder)
+            if not single_page_md_files:
+                print(f"No Markdown file generated for page {page_number}")
+                continue
+
+            single_page_md_path = os.path.join(single_page_output_folder, single_page_md_files[0])
+            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
+                single_page_md_content = single_page_md_file.read()
+
+            # Double the backslashes so re.sub does not read them as escapes in the replacement text
+            single_page_md_content = single_page_md_content.replace('\\', '\\\\')
+
+            # Substitute the recovered page text for this page's marker
+            markdown_content = re.sub(rf'\[MISSING_PAGE.*?:{page_number}\]', single_page_md_content, markdown_content)
+
+        pdf_document.close()
+
+        # Write the patched Markdown back over the original file
+        with open(md_path, 'w', encoding='utf-8') as md_file:
+            md_file.write(markdown_content)
 
     # Override
     def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
         # """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
+        temp_dir_path = output_path.parent
+
+        # Create the directory if it doesn't exist
+        if not temp_dir_path.exists():
+            os.makedirs(temp_dir_path)
+
+        # Define the path for the PDF without images in the output directory
+        pdf_without_images_path = temp_dir_path / input_path.name
+
+        # Remove images from the PDF and save to the output directory
+        self.remove_images_from_pdf(input_path, pdf_without_images_path)
+
         command = [
             "nougat",
-            str(input_path),
+            str(pdf_without_images_path),
             # nougat requires the argument output path to be a directory, not file, so we need to handle it here
             "-o",
             str(output_path.parent),
             "--no-skipping",
             "--model",
             self.model_tag,
-            "--batchsize",
-            str(self.batch_size),
+            # "--batchsize",
+            # str(self.batch_size),
         ]
         try:
             result = subprocess.run(command, check=False, capture_output=True, text=True)
@@ -54,9 +165,10 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
         except Exception as e:
             self._logger.error(f"An error occurred: {str(e)}")
             raise
-
+        
         # Now change the file name of generated mmd file to align with the expected md file path from base converter
         output_mmd_path = output_path.with_suffix(".mmd")
+        self.extract_and_convert_pdf_to_md(str(pdf_without_images_path), str(output_mmd_path), str(temp_dir_path))
         # Rename it to `md` file
         target = output_path.with_suffix(".md")
         output_mmd_path.rename(target)
@@ -70,7 +182,7 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
     def _to_page(self, input_path: Path, output_path: Path) -> Page:
         """Perform Markdown to Page conversion."""
         try:
-            input_path = self._to_markdown(input_path, output_path)
+            input_path = self._to_markdown(input_path, output_path,)
         except Exception as e:
             self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
             raise
@@ -81,7 +193,9 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page:
         with open(input_path, "r") as input_file:
             text = input_file.read()
 
+
         metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
         metadata_content = self._read_metadata(metadata_path)
         url = metadata_content.get("URL")
         return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
+
diff --git a/rag/requirements.txt b/rag/requirements.txt
@@ -23,5 +23,9 @@ tokenizers==0.15.2
 torch==2.3.0
 tqdm==4.66.2
 transformers==4.38.2
+pymupdf==1.24.5
+pymupdfb==1.24.3
 voyageai==0.2.2
+pix2text==1.1.1
 rst_to_myst==0.4.0
+
diff --git a/rag/scraper/Scrape_pdf/README.md b/rag/scraper/Scrape_pdf/README.md
@@ -1,13 +1,14 @@
 # Scrape_pdf  
-First we will need to convert the pdf into a markdown format. We will use a tool called nougat.
+First we will need to convert the pdf into a markdown format. We will use two tools called nougat and pix2text.
 - run `pip install nougat-ocr` to install nougat
-- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
-  ```
-  pdf_to_md('~/Downloads/MLS.pdf', 'textbook')
+- run `pip install pix2text` to install pix2text
+- Go to `Scrape_pdf.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
+- Change the path in `Scrape_pdf.py` to your file path and run it.
+  ```
   ```
 - After you get your markdown folder now run `header.py` to segment the contents of the markdown file into headers and contents.
   ```
   # TODO
-  parser = MarkdownParser('textbook/MLS.mmd')
   ```
-- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
+- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
+Nougat runs faster on a computer with a GPU.
diff --git a/rag/scraper/Scrape_pdf/__init__.py b/rag/scraper/Scrape_pdf/__init__.py
diff --git a/rag/scraper/Scrape_pdf/nougat.py b/rag/scraper/Scrape_pdf/nougat.py
@@ -1,7 +1,7 @@
 import os
 def pdf_to_md(pdf_file_path, folder_name):
     # Command to execute
-    command = f"nougat {pdf_file_path} -o {folder_name}"
+    command = f"nougat {pdf_file_path} -o {folder_name} -m 0.1.0-base --no-skipping"
     # Run the command
     os.system(command)
 

diff --git a/tests/utils.py b/tests/utils.py
@@ -7,6 +7,7 @@
 from difflib import SequenceMatcher
 from pathlib import Path
 from typing import List, Union
+from typing import Set
 
 from colorama import Fore, Style, init
 
@@ -133,8 +134,13 @@ def compare_folders(expected_dir: Path, output_dir: Path, similarity_threshold:
     Returns:
         bool: True if the folders match, False otherwise.
     """
-    expected_files = {file.relative_to(expected_dir) for file in expected_dir.rglob("*") if file.is_file()}
-    output_files = {file.relative_to(output_dir) for file in output_dir.rglob("*") if file.is_file()}
+
+    # PDF files do not need to be compared, so they are ignored for now
+    def get_non_pdf_files(dir: Path) -> Set[Path]:
+        return {file.relative_to(dir) for file in dir.rglob("*") if file.is_file() and file.suffix.lower() != ".pdf"}
+
+    expected_files = get_non_pdf_files(expected_dir)
+    output_files = get_non_pdf_files(output_dir)
 
     all_matched = True
     # Compare common files