diff --git a/rag/.gitignore b/rag/.gitignore index 2b0a4f13..4d8a5510 100644 --- a/rag/.gitignore +++ b/rag/.gitignore @@ -6,7 +6,7 @@ /scraper/Scrape_header/*/ /scraper/Scrape_rst/*/ /scraper/Scrape_md/*/ -/scraper/Scrape_pdf/*/ +# /scraper/Scrape_pdf/*/ # Remove all videos and audio files *mp4 diff --git a/rag/file_conversion_router/conversion/pdf_converter.py b/rag/file_conversion_router/conversion/pdf_converter.py index b4282798..99e20f9e 100644 --- a/rag/file_conversion_router/conversion/pdf_converter.py +++ b/rag/file_conversion_router/conversion/pdf_converter.py @@ -1,5 +1,6 @@ import subprocess from pathlib import Path +import os from rag.file_conversion_router.conversion.base_converter import BaseConverter from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup @@ -7,6 +8,9 @@ from rag.file_conversion_router.classes.chunk import Chunk import yaml +from rag.scraper.Scrape_pdf.pdf_helper import generate_mmd_file_path +from rag.scraper.Scrape_pdf.Scrape_pdf import process_pdf, pdf_to_md + class PdfConverter(BaseConverter): def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4): super().__init__() @@ -32,19 +36,26 @@ def _validate_parameters(self): # Override def _to_markdown(self, input_path: Path, output_path: Path) -> Path: + input_pdf = str(input_path) + folder_name = str(output_path.parent) + output_pdf = process_pdf(input_pdf) + # """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration.""" - # command = [ - # "nougat", - # str(input_path), - # # nougat requires the argument output path to be a directory, not file, so we need to handle it here - # "-o", - # str(output_path.parent), - # "--no-skipping", - # "--model", - # self.model_tag, - # "--batchsize", - # str(self.batch_size), - # ] + command = [ + "nougat", + str(input_path), + # nougat requires the argument output path to be a directory, not file, so we need to handle it here + "-o", + str(output_path.parent), + 
"--no-skipping", + "--model", + self.model_tag, + "--batchsize", + str(self.batch_size), + ] + + mmd_file_path = generate_mmd_file_path(folder_name) + pdf_to_md(output_pdf, folder_name, mmd_file_path) # try: # result = subprocess.run(command, check=False, capture_output=True, text=True) # self._logger.info(f"Output: {result.stdout}") @@ -52,6 +63,7 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path: # if result.returncode != 0: # self._logger.error(f"Command exited with a non-zero status: {result.returncode}") # # Now change the file name of generated mmd file to align with the expected md file path from base converter + output_path = Path(mmd_file_path) output_mmd_path = output_path.with_suffix(".mmd") # Rename it to `md` file target = output_path.with_suffix(".md") @@ -79,4 +91,5 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page: metadata_content = yaml.safe_load(metadata_file) url = metadata_content.get("URL", None) return Page(content={'text': text}, filetype=filetype, page_url=url) - + + diff --git a/rag/requirements.txt b/rag/requirements.txt index 4bc6c8e7..0c5ffd28 100644 --- a/rag/requirements.txt +++ b/rag/requirements.txt @@ -23,4 +23,7 @@ tokenizers==0.15.2 torch==2.3.0 tqdm==4.66.2 transformers==4.38.2 +pymupdf==1.24.5 +pymupdfb==1.24.3 voyageai==0.2.2 +pix2text==1.1.1 \ No newline at end of file diff --git a/rag/scraper/Scrape_pdf/README.md b/rag/scraper/Scrape_pdf/README.md index a61ddddb..5d9ba815 100644 --- a/rag/scraper/Scrape_pdf/README.md +++ b/rag/scraper/Scrape_pdf/README.md @@ -1,13 +1,12 @@ # Scrape_pdf First we will need to convert the pdf into a markdown format. We will use a tool called nougat. - run `pip install nougat-ocr` to install nougat -- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at. 
def add_remove_image_suffix(pdf_path: str) -> str:
    """Return *pdf_path* with ``_remove_image`` inserted before the extension.

    Args:
        pdf_path: Path of a PDF file. The extension check is case-insensitive.

    Returns:
        The sibling path ``<stem>_remove_image<suffix>`` as a string; the
        extension is kept exactly as it was written (e.g. ``.PDF``).

    Raises:
        ValueError: If the path does not end in ``.pdf``.
    """
    input_path = Path(pdf_path)
    if input_path.suffix.lower() != '.pdf':
        raise ValueError("The input file must be a PDF.")

    # Path.with_stem() only exists on Python 3.9+; with_name() produces the
    # same path on every interpreter the pinned requirements can run on.
    new_path = input_path.with_name(
        f"{input_path.stem}_remove_image{input_path.suffix}"
    )
    return str(new_path)


def remove_images(input_pdf, output_pdf):
    """Save a copy of *input_pdf* at *output_pdf* after deleting its images.

    For every page, each image reported by ``page.get_images(full=True)`` is
    passed to ``page.delete_image`` and the page contents are then cleaned.

    Args:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the modified PDF is written to.
    """
    doc = fitz.open(input_pdf)
    try:
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # The first field of every image entry is its xref; walk the list
            # backwards, matching the original deletion order.
            for image in reversed(page.get_images(full=True)):
                page.delete_image(image[0])

            # Optionally clean up empty spaces
            page.clean_contents()

        doc.save(output_pdf)
    finally:
        # Release the document even when a page operation or the save fails.
        doc.close()
def generate_mmd_file_path(folder_path):
    """Return the ``.mmd`` path expected inside *folder_path*.

    The file name is ``<folder name>_remove_image.mmd``, where the folder name
    is the last component of *folder_path*.

    Args:
        folder_path: Output folder, as a string or path-like object.

    Returns:
        ``folder_path`` joined with the derived file name.
    """
    # A trailing separator would make basename() return "" and produce the
    # bogus name "_remove_image.mmd", so trim separators before splitting.
    separators = os.sep + (os.altsep or "")
    folder_name = os.path.basename(os.fspath(folder_path).rstrip(separators))
    return os.path.join(folder_path, f"{folder_name}_remove_image.mmd")


# Example locations used when this module is run as a script. They are built
# with os.path.join because the previous literals used bare backslashes
# ("\s", "\e"), which are invalid escape sequences and only form a path on
# Windows.
input_pdf = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf.pdf")
folder_name = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf")


if __name__ == '__main__':
    # 1. strip images, 2. run the PDF-to-Markdown step, 3. fill in the pages
    #    that step reported as missing.
    output_pdf = process_pdf(input_pdf)
    pdf_to_md(output_pdf, folder_name)
    md_file_path = generate_mmd_file_path(folder_name)
    missing_page_fill(output_pdf, folder_name, md_file_path)
# Matches "[MISSING_PAGE<anything>:<page>]". The middle part may not contain
# "]" or a newline, so a marker without a page number can never swallow the
# text that follows it up to some later ":<digits>]".
_MISSING_PAGE_TEMPLATE = r'\[MISSING_PAGE[^\]\n]*?:{page}\]'


def _find_missing_pages(markdown_content):
    """Return the distinct page numbers (as strings) named by MISSING_PAGE markers, in first-seen order."""
    pattern = _MISSING_PAGE_TEMPLATE.format(page=r'(\d+)')
    return list(dict.fromkeys(re.findall(pattern, markdown_content)))


def _export_single_page(pdf_document, page_index, target_path):
    """Save page *page_index* of *pdf_document* as a standalone PDF at *target_path*."""
    single_page_document = fitz.open()
    try:
        single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
        single_page_document.save(target_path)
    finally:
        single_page_document.close()


def _pick_markdown_file(folder):
    """Return the path of the Markdown file found in *folder*, or None when it holds no regular file."""
    candidates = sorted(
        name for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )
    # Prefer real Markdown output; the converter may also leave other files
    # or sub-folders behind (assumption - TODO confirm against pix2text).
    markdown_files = [name for name in candidates if name.lower().endswith(('.md', '.mmd'))]
    chosen = markdown_files or candidates
    return os.path.join(folder, chosen[0]) if chosen else None


def extract_and_convert_pdf_to_md(pdf_path, md_path, output_folder):
    """Replace MISSING_PAGE markers in a Markdown file with re-converted pages.

    Every page number named by a ``[MISSING_PAGE...:<n>]`` marker in
    *md_path* is exported from *pdf_path* as ``page_<n>.pdf``, converted with
    ``convert_pdf_to_markdown`` into ``page_<n>_output`` (both inside
    *output_folder*), and the marker is replaced by the generated Markdown.
    The updated text is written back to *md_path*.

    Args:
        pdf_path: PDF the Markdown file was produced from.
        md_path: Markdown file to patch in place.
        output_folder: Existing folder for the per-page intermediate files.

    Returns:
        None. Nothing is read from the PDF or written when *md_path* does not
        exist or contains no numbered marker.
    """
    # Check the Markdown file first so no PDF handle is opened (and leaked)
    # on the early-return path.
    if not os.path.exists(md_path):
        print(f"Markdown file does not exist: {md_path}")
        return

    with open(md_path, 'r', encoding='utf-8') as md_file:
        markdown_content = md_file.read()

    missing_pages = _find_missing_pages(markdown_content)
    if not missing_pages:
        return

    pdf_document = fitz.open(pdf_path)
    try:
        page_total = len(pdf_document)
        for page_number in missing_pages:
            # Markers are treated as 1-based page numbers.
            page_index = int(page_number) - 1
            if not 0 <= page_index < page_total:
                print(f"Page {page_number} is not in {pdf_path}; marker left in place")
                continue

            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
            _export_single_page(pdf_document, page_index, single_page_pdf_path)

            # Convert the single-page PDF into its own output folder
            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
            os.makedirs(single_page_output_folder, exist_ok=True)
            convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)

            single_page_md_path = _pick_markdown_file(single_page_output_folder)
            if single_page_md_path is None:
                print(f"No Markdown file generated for page {page_number}")
                continue

            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
                single_page_md_content = single_page_md_file.read()

            # A callable replacement is inserted verbatim, so backslashes in
            # the page content (LaTeX, etc.) need no manual escaping.
            marker = _MISSING_PAGE_TEMPLATE.format(page=page_number)
            markdown_content = re.sub(
                marker,
                lambda _match, text=single_page_md_content: text,
                markdown_content,
            )
    finally:
        pdf_document.close()

    # Save the updated Markdown content
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)


def missing_page_fill(pdf_file_path, folder_name, md_file_path):
    """Create *folder_name* if needed, then fill the missing pages of *md_file_path*.

    Args:
        pdf_file_path: PDF the Markdown file was produced from.
        folder_name: Folder that receives the per-page intermediate files.
        md_file_path: Markdown file whose MISSING_PAGE markers are replaced.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_name, exist_ok=True)

    extract_and_convert_pdf_to_md(pdf_file_path, md_file_path, folder_name)


def generate_mmd_file_path(folder_path):
    """Return ``<folder_path>/<folder name>_remove_image.mmd``.

    Args:
        folder_path: Output folder, as a string or path-like object.
    """
    # Trim trailing separators; otherwise basename() is "" and the result
    # would be "<folder>/_remove_image.mmd".
    separators = os.sep + (os.altsep or "")
    folder_name = os.path.basename(os.fspath(folder_path).rstrip(separators))
    return os.path.join(folder_path, f"{folder_name}_remove_image.mmd")