diff --git a/rag/.gitignore b/rag/.gitignore index 2b0a4f1..4d8a551 100644 --- a/rag/.gitignore +++ b/rag/.gitignore @@ -6,7 +6,7 @@ /scraper/Scrape_header/*/ /scraper/Scrape_rst/*/ /scraper/Scrape_md/*/ -/scraper/Scrape_pdf/*/ +# /scraper/Scrape_pdf/*/ # Remove all videos and audio files *mp4 diff --git a/rag/file_conversion_router/api.py b/rag/file_conversion_router/api.py index 6983d46..418c7e4 100644 --- a/rag/file_conversion_router/api.py +++ b/rag/file_conversion_router/api.py @@ -15,3 +15,4 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path]) 2. Markdown (To clarify, this markdown includes additional tree structure of original markdown file) """ process_folder(input_dir, output_dir) + diff --git a/rag/file_conversion_router/conversion/pdf_converter.py b/rag/file_conversion_router/conversion/pdf_converter.py index 493365f..4ca3875 100644 --- a/rag/file_conversion_router/conversion/pdf_converter.py +++ b/rag/file_conversion_router/conversion/pdf_converter.py @@ -1,5 +1,10 @@ import subprocess from pathlib import Path +import os +import fitz +import re +from pix2text import Pix2Text + from rag.file_conversion_router.conversion.base_converter import BaseConverter from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup @@ -7,6 +12,7 @@ from rag.file_conversion_router.classes.chunk import Chunk import yaml + class PdfConverter(BaseConverter): def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4): super().__init__() @@ -29,21 +35,126 @@ def _validate_parameters(self): acceptable_models = ["0.1.0-small", "0.1.0-base"] if self.model_tag not in acceptable_models: raise ValueError(f"Model tag must be one of {acceptable_models}") + + def convert_pdf_to_markdown(self, pdf_file_path, output_file_path, page_numbers=None): + # """ + # Convert a PDF file to Markdown format. + + # Parameters: + # pdf_file_path (str): The file path of the input PDF. 
+ # output_file_path (str): The file path where the output Markdown will be saved. + # page_numbers (list of int, optional): List of page numbers to process. Defaults to None (process all pages). + # """ + try: + # Initialize Pix2Text with default configuration + p2t = Pix2Text.from_config() + + # Recognize text in the PDF + doc = p2t.recognize_pdf(pdf_file_path, page_numbers=page_numbers) + + # Save the recognized text to a Markdown file + doc.to_markdown(output_file_path) + + print(f"Markdown saved to {output_file_path}") + except Exception as e: + print(f"An error occurred: {e}") + + def remove_images_from_pdf(self, input_path: Path, output_path: Path): + pdf_document = fitz.open(input_path) + + for page_num in range(len(pdf_document)): + page = pdf_document.load_page(page_num) + images = page.get_images(full=True) + + # Remove each image + for img_index in range(len(images) - 1, -1, -1): + xref = images[img_index][0] + page.delete_image(xref) + + # Optionally clean up empty spaces + page.clean_contents() + pdf_document.save(output_path) + pdf_document.close() + + def extract_and_convert_pdf_to_md(self, pdf_path, md_path, output_folder): + # Open the PDF document + pdf_document = fitz.open(pdf_path) + + # Check if the Markdown file exists + if not os.path.exists(md_path): + print(f"Markdown file does not exist: {md_path}") + return + + # Read the existing Markdown content + with open(md_path, 'r', encoding='utf-8') as md_file: + markdown_content = md_file.read() + + # Match all forms of MISSING_PAGE markers + missing_pages = re.findall(r'\[MISSING_PAGE.*?:(\d+)\]', markdown_content) + + # Extract missing pages as separate PDF files + for page_number in missing_pages: + page_index = int(page_number) - 1 + page = pdf_document.load_page(page_index) + single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf") + single_page_document = fitz.open() + single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index) + 
single_page_document.save(single_page_pdf_path) + + # Run Nougat on the single page PDF + single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output") + if not os.path.exists(single_page_output_folder): + os.makedirs(single_page_output_folder) + self.convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder) + + # Read the generated Markdown content for this page + single_page_md_files = os.listdir(single_page_output_folder) + if not single_page_md_files: + print(f"No Markdown file generated for page {page_number}") + continue + + single_page_md_path = os.path.join(single_page_output_folder, single_page_md_files[0]) + with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file: + single_page_md_content = single_page_md_file.read() + + # Escape backslashes in single_page_md_content + single_page_md_content = single_page_md_content.replace('\\', '\\\\') + + # Replace the missing page marker with the actual content + markdown_content = re.sub(rf'\[MISSING_PAGE.*?:{page_number}\]', single_page_md_content, markdown_content) + + pdf_document.close() + + # Save the updated Markdown content + with open(md_path, 'w', encoding='utf-8') as md_file: + md_file.write(markdown_content) # Override def _to_markdown(self, input_path: Path, output_path: Path) -> Path: # """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration.""" + temp_dir_path = output_path.parent + + # Create the directory if it doesn't exist + if not temp_dir_path.exists(): + os.makedirs(temp_dir_path) + + # Define the path for the PDF without images in the output directory + pdf_without_images_path = temp_dir_path / input_path.name + + # Remove images from the PDF and save to the output directory + self.remove_images_from_pdf(input_path, pdf_without_images_path) + command = [ "nougat", - str(input_path), + str(pdf_without_images_path), # nougat requires the argument output path to be a directory, not file, so we need to 
handle it here "-o", str(output_path.parent), "--no-skipping", "--model", self.model_tag, - "--batchsize", - str(self.batch_size), + # "--batchsize", + # str(self.batch_size), ] try: result = subprocess.run(command, check=False, capture_output=True, text=True) @@ -54,9 +165,10 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path: except Exception as e: self._logger.error(f"An error occurred: {str(e)}") raise - + # Now change the file name of generated mmd file to align with the expected md file path from base converter output_mmd_path = output_path.with_suffix(".mmd") + self.extract_and_convert_pdf_to_md(str(pdf_without_images_path), str(output_mmd_path), str(temp_dir_path)) # Rename it to `md` file target = output_path.with_suffix(".md") output_mmd_path.rename(target) @@ -70,7 +182,7 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path: def _to_page(self, input_path: Path, output_path: Path) -> Page: """Perform Markdown to Page conversion.""" try: - input_path = self._to_markdown(input_path, output_path) + input_path = self._to_markdown(input_path, output_path,) except Exception as e: self._logger.error(f"An error occurred during markdown conversion: {str(e)}") raise @@ -81,7 +193,9 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page: with open(input_path, "r") as input_file: text = input_file.read() + metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml") metadata_content = self._read_metadata(metadata_path) url = metadata_content.get("URL") return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url) + diff --git a/rag/requirements.txt b/rag/requirements.txt index 71bc240..f56ba65 100644 --- a/rag/requirements.txt +++ b/rag/requirements.txt @@ -23,5 +23,9 @@ tokenizers==0.15.2 torch==2.3.0 tqdm==4.66.2 transformers==4.38.2 +pymupdf==1.24.5 +pymupdfb==1.24.3 voyageai==0.2.2 +pix2text==1.1.1 rst_to_myst==0.4.0 + diff --git a/rag/scraper/Scrape_pdf/README.md 
b/rag/scraper/Scrape_pdf/README.md index a61dddd..5f14763 100644 --- a/rag/scraper/Scrape_pdf/README.md +++ b/rag/scraper/Scrape_pdf/README.md @@ -1,13 +1,14 @@ # Scrape_pdf -First we will need to convert the pdf into a markdown format. We will use a tool called nougat. +First we will need to convert the pdf into a markdown format. We will use two tools called nougat and pix2text. - run `pip install nougat-ocr` to install nougat -- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at. - ``` - pdf_to_md('~/Downloads/MLS.pdf', 'textbook') +- run `pip install pix2text` to install pix2text +- Go to `Scrape_pdf.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at. +- change the path in Scrape_pdf.py to your file path and run + ``` - After you get your markdown folder now run `header.py` to segment the contents of the markdown file into headers and contents. ``` # TODO - parser = MarkdownParser('textbook/MLS.mmd') ``` -- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website. \ No newline at end of file +- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website. 
+nougat runs faster on a computer with a GPU \ No newline at end of file diff --git a/rag/scraper/Scrape_pdf/__init__.py b/rag/scraper/Scrape_pdf/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/rag/scraper/Scrape_pdf/nougat.py b/rag/scraper/Scrape_pdf/nougat.py index fec1b5c..1f6fd76 100644 --- a/rag/scraper/Scrape_pdf/nougat.py +++ b/rag/scraper/Scrape_pdf/nougat.py @@ -1,7 +1,7 @@ import os def pdf_to_md(pdf_file_path, folder_name): # Command to execute - command = f"nougat {pdf_file_path} -o {folder_name}" + command = f"nougat {pdf_file_path} -o {folder_name} -m 0.1.0-base --no-skipping" # Run the command os.system(command) diff --git a/tests/utils.py b/tests/utils.py index fd5dce1..615553f 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -7,6 +7,7 @@ from difflib import SequenceMatcher from pathlib import Path from typing import List, Union +from typing import Set from colorama import Fore, Style, init @@ -133,8 +134,13 @@ def compare_folders(expected_dir: Path, output_dir: Path, similarity_threshold: Returns: bool: True if the folders match, False otherwise. """ - expected_files = {file.relative_to(expected_dir) for file in expected_dir.rglob("*") if file.is_file()} - output_files = {file.relative_to(output_dir) for file in output_dir.rglob("*") if file.is_file()} + + # PDF files do not need to be compared, so we ignore them for now + def get_non_pdf_files(dir: Path) -> Set[Path]: + return {file.relative_to(dir) for file in dir.rglob("*") if file.is_file() and file.suffix.lower() != ".pdf"} + + expected_files = get_non_pdf_files(expected_dir) + output_files = get_non_pdf_files(output_dir) all_matched = True # Compare common files