Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Yikang/pdf conversion #54

Closed
wants to merge 27 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
dec28d5
commiting scraper_pdf folder. remove ignore for now
Catrunaround Jul 6, 2024
20471b7
YIkang's scrape pdf
Catrunaround Jul 6, 2024
d129206
edited router
Catrunaround Jul 6, 2024
6f7f610
edited nougat
Catrunaround Jul 6, 2024
0fb4fee
readme
Catrunaround Jul 7, 2024
66e1df7
added requirements
Catrunaround Jul 9, 2024
ac6524e
added pix2text
Catrunaround Jul 9, 2024
439417f
fix api question
Catrunaround Jul 9, 2024
23ffdd4
try new functing
Catrunaround Jul 9, 2024
e108ef9
fix compare question
Catrunaround Jul 12, 2024
60f0773
update pdf_scrape readme
Catrunaround Jul 12, 2024
1d11573
Merge branch 'main' of https://github.com/Catrunaround/tai into yikan…
Catrunaround Jul 14, 2024
7a770a6
fixed fialed and passed test
Catrunaround Jul 14, 2024
15090e6
updata readme and passed api test
Catrunaround Jul 14, 2024
8917041
modified api
Catrunaround Jul 14, 2024
4740b8d
ignore pdf file while comparing
Catrunaround Jul 16, 2024
57675ed
ignore extra pdf while comparing
Catrunaround Jul 16, 2024
f62d023
updated readme
terrianne-zhang Jul 9, 2024
1a635a7
Update test cases for fixing converter bug
perryzjc Jul 14, 2024
daa5d9b
fix merge error
Catrunaround Jul 16, 2024
56c9b84
fixed fialed and passed test
Catrunaround Jul 14, 2024
84482b3
updata readme and passed api test
Catrunaround Jul 14, 2024
901037f
modified api
Catrunaround Jul 14, 2024
1b82a95
ignore pdf file while comparing
Catrunaround Jul 16, 2024
b89c9eb
ignore extra pdf while comparing
Catrunaround Jul 16, 2024
e81abb5
use base model
Catrunaround Jul 16, 2024
e151fd5
Merge branch 'yikang/pdf_conversion' of https://github.com/Catrunarou…
Catrunaround Jul 16, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion rag/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
/scraper/Scrape_header/*/
/scraper/Scrape_rst/*/
/scraper/Scrape_md/*/
/scraper/Scrape_pdf/*/
# /scraper/Scrape_pdf/*/

# Remove all videos and audio files
*mp4
Expand Down
3 changes: 3 additions & 0 deletions rag/file_conversion_router/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,6 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path])
2. Markdown (To clarify, this markdown includes additional tree structure of original markdown file)
"""
process_folder(input_dir, output_dir)

if __name__ == '__main__':
    # Manual smoke run against the sample PDF folder.
    # Forward slashes replace the original backslash-separated literals, whose
    # "\s", "\T" and "\O" are invalid escape sequences in a normal string and
    # which only resolved on Windows; forward slashes work on every platform.
    convert_directory("rag/scraper/Test_pdf", "rag/scraper/Output")
126 changes: 120 additions & 6 deletions rag/file_conversion_router/conversion/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,113 @@
import subprocess
from pathlib import Path
import os
import fitz
import re
from pix2text import Pix2Text


from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.chunk import Chunk
import yaml


def convert_pdf_to_markdown(pdf_file_path, output_file_path, page_numbers=None):
    """
    Recognize a PDF with Pix2Text and export the result as Markdown.

    This is a best-effort operation: any exception raised while recognizing or
    exporting is caught and reported on stdout, never re-raised, so the
    function always returns None.

    Parameters:
    pdf_file_path (str): The file path of the input PDF.
    output_file_path (str): Destination handed to the recognized document's
        ``to_markdown`` call.
        NOTE(review): the caller in this file passes a directory here --
        confirm whether Pix2Text expects a directory or a file path.
    page_numbers (list of int, optional): Pages to process, forwarded unchanged
        to ``recognize_pdf``. Defaults to None (process all pages).
    """
    try:
        # Default-configured recognizer -> recognized document -> Markdown export
        recognizer = Pix2Text.from_config()
        recognized = recognizer.recognize_pdf(pdf_file_path, page_numbers=page_numbers)
        recognized.to_markdown(output_file_path)

        print(f"Markdown saved to {output_file_path}")
    except Exception as e:
        # Deliberately swallowed: callers detect failure by the missing output.
        print(f"An error occurred: {e}")


def remove_images_from_pdf(input_path: Path, output_path: Path):
    """
    Save a copy of a PDF in which every image on every page has been deleted.

    Parameters:
    input_path (Path): The PDF to read.
    output_path (Path): Where the image-free copy is written.

    Any error raised by PyMuPDF while opening, editing or saving propagates to
    the caller; the source document is closed in every case.
    """
    # The context manager closes the document even when a page operation or
    # the save raises, which the original open()/close() pair did not.
    with fitz.open(input_path) as pdf_document:
        for page in pdf_document:
            # Walk the image list back to front, as the original did; the first
            # field of each entry is the xref that delete_image() takes.
            for image in reversed(page.get_images(full=True)):
                page.delete_image(image[0])

            # Optionally clean up the page's content stream after the removals
            page.clean_contents()

        pdf_document.save(output_path)


def extract_and_convert_pdf_to_md(pdf_path, md_path, output_folder):
    """
    Fill ``[MISSING_PAGE...:N]`` markers in a Markdown file with re-converted pages.

    For every page number N referenced by a marker, page N of the PDF is
    written to ``<output_folder>/page_N.pdf``, converted with
    ``convert_pdf_to_markdown`` into ``<output_folder>/page_N_output``, and the
    resulting Markdown replaces every marker for that page. The Markdown file
    is rewritten in place once all pages have been processed.

    Parameters:
    pdf_path (str): The PDF the Markdown file was generated from.
    md_path (str): The Markdown file to patch. If it does not exist a message
        is printed and nothing else happens.
    output_folder (str): Folder receiving the single-page PDFs and their
        per-page conversion output.

    Pages whose conversion yields no Markdown file, or whose number lies
    outside the PDF, are reported on stdout and their markers are left as-is.
    When the Markdown contains no numbered marker the PDF is never opened and
    the file is left untouched.
    """
    # Check the Markdown file first so this early return cannot leave an open
    # PDF handle behind.
    if not os.path.exists(md_path):
        print(f"Markdown file does not exist: {md_path}")
        return

    # Read the existing Markdown content
    with open(md_path, 'r', encoding='utf-8') as md_file:
        markdown_content = md_file.read()

    # Match all forms of MISSING_PAGE markers. `[^\]\n]*?` keeps a match inside
    # one bracket pair, so a marker that carries no page number can no longer
    # swallow the text up to the next numbered marker on the same line.
    # dict.fromkeys de-duplicates while keeping first-seen order, so a page
    # referenced twice is only converted once.
    missing_pages = list(dict.fromkeys(
        re.findall(r'\[MISSING_PAGE[^\]\n]*?:(\d+)\]', markdown_content)
    ))
    if not missing_pages:
        return

    pdf_document = fitz.open(pdf_path)
    try:
        page_count = len(pdf_document)

        for page_number in missing_pages:
            page_index = int(page_number) - 1  # markers are treated as 1-based
            if not 0 <= page_index < page_count:
                print(f"Page {page_number} is out of range for {pdf_path}")
                continue

            # Extract the missing page as a separate PDF file
            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
            single_page_document = fitz.open()
            try:
                single_page_document.insert_pdf(
                    pdf_document, from_page=page_index, to_page=page_index
                )
                single_page_document.save(single_page_pdf_path)
            finally:
                single_page_document.close()

            # Run Pix2Text (via convert_pdf_to_markdown) on the single page PDF
            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
            os.makedirs(single_page_output_folder, exist_ok=True)
            convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)

            # Pick a Markdown file deterministically; the output folder may
            # also hold non-Markdown entries, and listdir() order is arbitrary.
            single_page_md_files = sorted(
                name
                for name in os.listdir(single_page_output_folder)
                if name.lower().endswith(('.md', '.mmd'))
                and os.path.isfile(os.path.join(single_page_output_folder, name))
            )
            if not single_page_md_files:
                print(f"No Markdown file generated for page {page_number}")
                continue

            single_page_md_path = os.path.join(single_page_output_folder, single_page_md_files[0])
            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
                single_page_md_content = single_page_md_file.read()

            # A callable replacement is inserted verbatim, so backslashes in the
            # page content (e.g. LaTeX) need no manual escaping.
            markdown_content = re.sub(
                rf'\[MISSING_PAGE[^\]\n]*?:{page_number}\]',
                lambda _match, _text=single_page_md_content: _text,
                markdown_content,
            )
    finally:
        pdf_document.close()

    # Save the updated Markdown content
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)


class PdfConverter(BaseConverter):
def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4):
def __init__(self, model_tag: str = "0.1.0-base", batch_size: int = 4):
super().__init__()
self.model_tag = model_tag
self.batch_size = batch_size
Expand All @@ -33,17 +132,29 @@ def _validate_parameters(self):
# Override
def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
# """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
temp_dir_path = output_path.parent

# Create the directory if it doesn't exist
if not temp_dir_path.exists():
os.makedirs(temp_dir_path)

# Define the path for the PDF without images in the output directory
pdf_without_images_path = temp_dir_path / input_path.name

# Remove images from the PDF and save to the output directory
remove_images_from_pdf(input_path, pdf_without_images_path)

command = [
"nougat",
str(input_path),
str(pdf_without_images_path),
# nougat requires the argument output path to be a directory, not file, so we need to handle it here
"-o",
str(output_path.parent),
"--no-skipping",
"--model",
self.model_tag,
"--batchsize",
str(self.batch_size),
# "--batchsize",
# str(self.batch_size),
]
try:
result = subprocess.run(command, check=False, capture_output=True, text=True)
Expand All @@ -54,9 +165,10 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
except Exception as e:
self._logger.error(f"An error occurred: {str(e)}")
raise

# Now change the file name of generated mmd file to align with the expected md file path from base converter
output_mmd_path = output_path.with_suffix(".mmd")
extract_and_convert_pdf_to_md(str(pdf_without_images_path), str(output_mmd_path), str(temp_dir_path))
# Rename it to `md` file
target = output_path.with_suffix(".md")
output_mmd_path.rename(target)
Expand All @@ -70,7 +182,7 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
try:
input_path = self._to_markdown(input_path, output_path)
input_path = self._to_markdown(input_path, output_path,)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise
Expand All @@ -81,7 +193,9 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page:
with open(input_path, "r") as input_file:
text = input_file.read()


metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)

4 changes: 4 additions & 0 deletions rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,9 @@ tokenizers==0.15.2
torch==2.3.0
tqdm==4.66.2
transformers==4.38.2
pymupdf==1.24.5
pymupdfb==1.24.3
voyageai==0.2.2
pix2text==1.1.1
rst_to_myst==0.4.0

13 changes: 7 additions & 6 deletions rag/scraper/Scrape_pdf/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# Scrape_pdf
First we will need to convert the pdf into a markdown format. We will use a tool called nougat.
First we will need to convert the pdf into a markdown format. We will use two tools called nougat and pix2text.
- run `pip install nougat-ocr` to install nougat
- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
```
pdf_to_md('~/Downloads/MLS.pdf', 'textbook')
- run `pip install pix2text` to install pix2text
- Go to `Scrape_pdf.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
- Change the path in `Scrape_pdf.py` to your own file path, then run it.

```
- After you get your markdown folder now run `header.py` to segment the contents of the markdown file into headers and contents.
```
# TODO
parser = MarkdownParser('textbook/MLS.mmd')
```
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
- Note: nougat runs considerably faster on a machine with a GPU.
Empty file.
2 changes: 1 addition & 1 deletion rag/scraper/Scrape_pdf/nougat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
def pdf_to_md(pdf_file_path, folder_name):
    """Convert a PDF with the ``nougat`` command-line tool.

    Runs ``nougat <pdf> -o <folder> -m 0.1.0-base --no-skipping``.

    Args:
        pdf_file_path: Path of the PDF to convert; a leading ``~`` is expanded.
        folder_name: Output folder passed to ``-o``; a leading ``~`` is expanded.

    Raises:
        FileNotFoundError: If the ``nougat`` executable cannot be found.
    """
    import subprocess  # local import: only `os` is visibly imported by this file

    # An argument list (no shell) keeps paths that contain spaces or shell
    # metacharacters intact. "~" is expanded here because no shell does it.
    command = [
        "nougat",
        os.path.expanduser(pdf_file_path),
        "-o",
        os.path.expanduser(folder_name),
        "-m",
        "0.1.0-base",
        "--no-skipping",
    ]
    # check=False mirrors os.system: a non-zero exit status does not raise.
    subprocess.run(command, check=False)

Expand Down
10 changes: 8 additions & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from difflib import SequenceMatcher
from pathlib import Path
from typing import List, Union
from typing import Set

from colorama import Fore, Style, init

Expand Down Expand Up @@ -133,8 +134,13 @@ def compare_folders(expected_dir: Path, output_dir: Path, similarity_threshold:
Returns:
bool: True if the folders match, False otherwise.
"""
expected_files = {file.relative_to(expected_dir) for file in expected_dir.rglob("*") if file.is_file()}
output_files = {file.relative_to(output_dir) for file in output_dir.rglob("*") if file.is_file()}

# PDF files do not need to be compared, so we ignore them for now.
def get_non_pdf_files(dir: Path) -> Set[Path]:
    """Return the paths, relative to ``dir``, of every non-PDF file beneath it."""
    relative_paths = set()
    for entry in dir.rglob("*"):
        # Skip directories and anything with a ".pdf" suffix (case-insensitive).
        if not entry.is_file() or entry.suffix.lower() == ".pdf":
            continue
        relative_paths.add(entry.relative_to(dir))
    return relative_paths

expected_files = get_non_pdf_files(expected_dir)
output_files = get_non_pdf_files(output_dir)

all_matched = True
# Compare common files
Expand Down
Loading