Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yikang testing #47

Closed
wants to merge 7 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rag/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
/scraper/Scrape_header/*/
/scraper/Scrape_rst/*/
/scraper/Scrape_md/*/
/scraper/Scrape_pdf/*/
# /scraper/Scrape_pdf/*/

# Remove all videos and audio files
*mp4
Expand Down
39 changes: 26 additions & 13 deletions rag/file_conversion_router/conversion/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,16 @@
import subprocess
from pathlib import Path
import os

from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.chunk import Chunk
import yaml

from rag.scraper.Scrape_pdf.pdf_helper import generate_mmd_file_path
from rag.scraper.Scrape_pdf.Scrape_pdf import process_pdf, pdf_to_md

class PdfConverter(BaseConverter):
def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4):
super().__init__()
Expand All @@ -32,26 +36,34 @@ def _validate_parameters(self):

# Override
def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
input_pdf = str(input_path)
folder_name = str(output_path.parent)
output_pdf = process_pdf(input_pdf)

# """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
# command = [
# "nougat",
# str(input_path),
# # nougat requires the argument output path to be a directory, not file, so we need to handle it here
# "-o",
# str(output_path.parent),
# "--no-skipping",
# "--model",
# self.model_tag,
# "--batchsize",
# str(self.batch_size),
# ]
command = [
"nougat",
str(input_path),
# nougat requires the argument output path to be a directory, not file, so we need to handle it here
"-o",
str(output_path.parent),
"--no-skipping",
"--model",
self.model_tag,
"--batchsize",
str(self.batch_size),
]

mmd_file_path = generate_mmd_file_path(folder_name)
pdf_to_md(output_pdf, folder_name, mmd_file_path)
# try:
# result = subprocess.run(command, check=False, capture_output=True, text=True)
# self._logger.info(f"Output: {result.stdout}")
# self._logger.info(f"Errors: {result.stderr}")
# if result.returncode != 0:
# self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
# # Now change the file name of generated mmd file to align with the expected md file path from base converter
output_path = Path(mmd_file_path)
output_mmd_path = output_path.with_suffix(".mmd")
# Rename it to `md` file
target = output_path.with_suffix(".md")
Expand Down Expand Up @@ -79,4 +91,5 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content.get("URL", None)
return Page(content={'text': text}, filetype=filetype, page_url=url)



3 changes: 3 additions & 0 deletions rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,4 +23,7 @@ tokenizers==0.15.2
torch==2.3.0
tqdm==4.66.2
transformers==4.38.2
pymupdf==1.24.5
pymupdfb==1.24.3
voyageai==0.2.2
pix2text==1.1.1
9 changes: 4 additions & 5 deletions rag/scraper/Scrape_pdf/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
# Scrape_pdf
First, we need to convert the PDF into Markdown format. We will use a tool called nougat.
- run `pip install nougat-ocr` to install nougat
- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
```
pdf_to_md('~/Downloads/MLS.pdf', 'textbook')
- Go to `Scrape_pdf.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
- Change the path in `Scrape_pdf.py` to your file path, then run it.

```
- After you get your markdown folder, run `header.py` to segment the contents of the markdown file into headers and contents.
```
# TODO
parser = MarkdownParser('textbook/MLS.mmd')
```
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
59 changes: 59 additions & 0 deletions rag/scraper/Scrape_pdf/Scrape_pdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
from nougat import pdf_to_md
import fitz
from pathlib import Path
from pdf_helper import missing_page_fill
import os


def add_remove_image_suffix(pdf_path: str) -> str:
    """Return ``pdf_path`` with ``_remove_image`` inserted before its extension.

    The directory part and the original extension (including its letter case)
    are kept as they are.

    Raises:
        ValueError: If the path's suffix, compared case-insensitively, is not ``.pdf``.
    """
    source = Path(pdf_path)
    is_pdf = source.suffix.lower() == '.pdf'
    if not is_pdf:
        raise ValueError("The input file must be a PDF.")

    # Rebuild the last path component as <stem>_remove_image<original suffix>.
    renamed = source.with_name(f"{source.stem}_remove_image{source.suffix}")
    return str(renamed)

def remove_images(input_pdf, output_pdf):
    """Save a copy of ``input_pdf`` with the images on every page deleted.

    Parameters:
        input_pdf: Path of the PDF to read.
        output_pdf: Path the modified PDF is saved to.
    """
    # The context manager closes the document even when a page operation or
    # the save raises, so the file handle is never leaked.
    with fitz.open(input_pdf) as doc:
        for page in doc:
            # The first item of each image entry is passed to delete_image as
            # the xref. Entries are walked back-to-front.
            for image in reversed(page.get_images(full=True)):
                page.delete_image(image[0])

            # Tidy the page's content stream after the deletions.
            page.clean_contents()

        # Save the modified PDF
        doc.save(output_pdf)

def process_pdf(input_pdf: str) -> str:
    """Strip the images from ``input_pdf`` and return the path of the new PDF.

    The output path is the one produced by ``add_remove_image_suffix``, i.e.
    the input path with ``_remove_image`` added before the extension.
    """
    stripped_pdf = add_remove_image_suffix(input_pdf)
    remove_images(input_pdf, stripped_pdf)
    return stripped_pdf

def generate_mmd_file_path(folder_path):
    """Return ``<folder_path>/<folder name>_remove_image.mmd``.

    Parameters:
        folder_path: Output folder; its last component is used as the file stem.

    Returns:
        str: ``folder_path`` joined with ``<folder name>_remove_image.mmd``.
    """
    folder_path = os.fspath(folder_path)
    # Ignore trailing separators when extracting the folder name; otherwise
    # basename("out/doc/") is "" and the result would be "_remove_image.mmd".
    separators = os.sep + (os.altsep or "")
    folder_name = os.path.basename(folder_path.rstrip(separators))
    return os.path.join(folder_path, f"{folder_name}_remove_image.mmd")


# Example input PDF and the folder the conversion output is written to.
# Built with os.path.join so the separators are correct on every platform;
# backslashes inside a normal string literal are invalid escape sequences and
# only act as path separators on Windows.
input_pdf = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf.pdf")
folder_name = os.path.join("rag", "scraper", "Scrape_pdf", "example_pdf")


if __name__ == '__main__':
    # Write an image-free copy of the PDF and convert that copy.
    output_pdf = process_pdf(input_pdf)
    pdf_to_md(output_pdf, folder_name)
    # Fill any [MISSING_PAGE...] markers left in the generated .mmd file.
    md_file_path = generate_mmd_file_path(folder_name)
    missing_page_fill(output_pdf, folder_name, md_file_path)
Empty file.
Binary file added rag/scraper/Scrape_pdf/example_pdf.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion rag/scraper/Scrape_pdf/nougat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
def pdf_to_md(pdf_file_path, folder_name):
    """Run the ``nougat`` command-line tool on a PDF.

    Parameters:
        pdf_file_path: Path of the PDF to convert. A leading ``~`` is expanded.
        folder_name: Output directory passed to nougat's ``-o`` option.

    The command is executed without a shell and with an argument list, so
    paths containing spaces or shell metacharacters are passed through
    verbatim instead of being split or interpreted. As before, a failing
    nougat run does not raise.
    """
    # Imported here so the function does not depend on the module's import list.
    import subprocess

    command = [
        "nougat",
        # The shell used to expand "~"; do it explicitly now that no shell runs.
        os.path.expanduser(str(pdf_file_path)),
        "-o",
        os.path.expanduser(str(folder_name)),
        "-m",
        "0.1.0-base",
        "--no-skipping",
    ]
    try:
        # check=False mirrors os.system: a non-zero exit status is not an error here.
        subprocess.run(command, check=False)
    except FileNotFoundError:
        # os.system only printed a shell error when the executable was missing;
        # keep that best-effort behaviour instead of raising.
        print("nougat executable not found; is nougat-ocr installed?")

Expand Down
94 changes: 94 additions & 0 deletions rag/scraper/Scrape_pdf/pdf_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,94 @@
import os
import re
import fitz
from pix2text import Pix2Text


def convert_pdf_to_markdown(pdf_file_path, output_file_path, page_numbers=None):
    """
    Recognize a PDF with Pix2Text and export the result as Markdown.

    Any exception raised along the way is caught and reported with ``print``;
    the function does not re-raise and always returns ``None``.

    Parameters:
    pdf_file_path (str): The file path of the input PDF.
    output_file_path (str): Destination handed to the recognized document's
        ``to_markdown``. NOTE(review): the caller in this file passes a
        directory here; confirm what pix2text expects.
    page_numbers (list of int, optional): Forwarded to ``recognize_pdf``.
        Defaults to None.
    """
    try:
        # Build a recognizer from Pix2Text's default configuration.
        recognizer = Pix2Text.from_config()

        # Run recognition, then write the result out as Markdown.
        recognized = recognizer.recognize_pdf(pdf_file_path, page_numbers=page_numbers)
        recognized.to_markdown(output_file_path)

        print(f"Markdown saved to {output_file_path}")
    except Exception as error:
        print(f"An error occurred: {error}")

def _find_markdown_file(folder):
    """Return the first Markdown file in ``folder`` (sorted by name), or None."""
    for entry in sorted(os.listdir(folder)):
        candidate = os.path.join(folder, entry)
        # Skip sub-directories and non-Markdown files the converter may emit.
        if entry.lower().endswith(('.md', '.mmd')) and os.path.isfile(candidate):
            return candidate
    return None


def extract_and_convert_pdf_to_md(pdf_path, md_path, output_folder):
    """Replace ``[MISSING_PAGE...:N]`` markers in a Markdown file with page text.

    For every distinct page number N found in a marker, page N of the PDF
    (1-based) is saved as ``page_N.pdf`` in ``output_folder`` and converted
    with ``convert_pdf_to_markdown`` into ``page_N_output``. The first
    Markdown file found there replaces the marker(s) for that page. The
    updated text is written back to ``md_path``.

    Parameters:
        pdf_path: Path of the source PDF.
        md_path: Path of the Markdown file to update in place. If it does not
            exist, a message is printed and nothing else happens.
        output_folder: Existing folder that receives the per-page PDFs and
            their conversion output.
    """
    # Both documents are opened as context managers so they are closed on the
    # early return below and when any step raises.
    with fitz.open(pdf_path) as pdf_document:
        # Check if the Markdown file exists
        if not os.path.exists(md_path):
            print(f"Markdown file does not exist: {md_path}")
            return

        # Read the existing Markdown content
        with open(md_path, 'r', encoding='utf-8') as md_file:
            markdown_content = md_file.read()

        # [^\]\n] keeps each match inside one pair of brackets, so two markers
        # on the same line can never be swallowed by a single match.
        found_numbers = re.findall(r'\[MISSING_PAGE[^\]\n]*?:(\d+)\]', markdown_content)
        page_count = len(pdf_document)

        # dict.fromkeys drops repeated page numbers while keeping their order.
        for page_number in dict.fromkeys(found_numbers):
            page_index = int(page_number) - 1
            if not 0 <= page_index < page_count:
                print(f"Page {page_number} is not in {pdf_path}; marker left unchanged")
                continue

            # Extract the missing page as a separate single-page PDF.
            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
            with fitz.open() as single_page_document:
                single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
                single_page_document.save(single_page_pdf_path)

            # Convert the single-page PDF to Markdown.
            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
            os.makedirs(single_page_output_folder, exist_ok=True)
            convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)

            # Read the generated Markdown content for this page
            single_page_md_path = _find_markdown_file(single_page_output_folder)
            if single_page_md_path is None:
                print(f"No Markdown file generated for page {page_number}")
                continue

            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
                single_page_md_content = single_page_md_file.read()

            # A callable replacement is inserted literally, so backslashes in
            # the page text need no escaping.
            markdown_content = re.sub(
                rf'\[MISSING_PAGE[^\]\n]*?:{page_number}\]',
                lambda _match: single_page_md_content,
                markdown_content,
            )

    # Save the updated Markdown content
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

def missing_page_fill(pdf_file_path, folder_name, md_file_path):
    """Create ``folder_name`` if needed, then fill missing pages in the Markdown file.

    The actual work is delegated to ``extract_and_convert_pdf_to_md``, which
    receives the PDF path, the Markdown path and the folder, in that order.
    """
    output_dir_missing = not os.path.exists(folder_name)
    if output_dir_missing:
        os.makedirs(folder_name)

    extract_and_convert_pdf_to_md(pdf_file_path, md_file_path, folder_name)

def generate_mmd_file_path(folder_path):
    """Return ``<folder_path>/<folder name>_remove_image.mmd``.

    Parameters:
        folder_path: Output folder; its last component is used as the file stem.

    Returns:
        str: ``folder_path`` joined with ``<folder name>_remove_image.mmd``.
    """
    folder_path = os.fspath(folder_path)
    # Ignore trailing separators when extracting the folder name; otherwise
    # basename("out/doc/") is "" and the result would be "_remove_image.mmd".
    separators = os.sep + (os.altsep or "")
    folder_name = os.path.basename(folder_path.rstrip(separators))
    return os.path.join(folder_path, f"{folder_name}_remove_image.mmd")
Loading