Skip to content

Commit

Permalink
Yikang/new convsersion (#56)
Browse files Browse the repository at this point in the history
* commiting scraper_pdf folder. remove ignore for now

* YIkang's scrape pdf

* edited router

* edited nougat

* readme

* added requirements

* added pix2text

* fix api question

* try new functing

* fix compare question

* update pdf_scrape readme

* fixed fialed and passed test

* updata readme and passed api test

* modified api

* ignore pdf file while comparing

* ignore extra pdf while comparing

* remove api main func

* move funcs in pdf converter class
  • Loading branch information
Catrunaround committed Jul 16, 2024
1 parent ea2b52e commit 6a2875e
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 15 deletions.
2 changes: 1 addition & 1 deletion rag/.gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
/scraper/Scrape_header/*/
/scraper/Scrape_rst/*/
/scraper/Scrape_md/*/
/scraper/Scrape_pdf/*/
# /scraper/Scrape_pdf/*/

# Remove all videos and audio files
*mp4
Expand Down
1 change: 1 addition & 0 deletions rag/file_conversion_router/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ def convert_directory(input_dir: Union[str, Path], output_dir: Union[str, Path])
2. Markdown (To clarify, this markdown includes additional tree structure of original markdown file)
"""
process_folder(input_dir, output_dir)

124 changes: 119 additions & 5 deletions rag/file_conversion_router/conversion/pdf_converter.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,18 @@
import subprocess
from pathlib import Path
import os
import fitz
import re
from pix2text import Pix2Text


from rag.file_conversion_router.conversion.base_converter import BaseConverter
from rag.file_conversion_router.utils.hardware_detection import detect_gpu_setup
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.chunk import Chunk
import yaml


class PdfConverter(BaseConverter):
def __init__(self, model_tag: str = "0.1.0-small", batch_size: int = 4):
super().__init__()
Expand All @@ -29,21 +35,126 @@ def _validate_parameters(self):
acceptable_models = ["0.1.0-small", "0.1.0-base"]
if self.model_tag not in acceptable_models:
raise ValueError(f"Model tag must be one of {acceptable_models}")

def convert_pdf_to_markdown(self, pdf_file_path, output_file_path, page_numbers=None):
    """Convert a PDF to Markdown with Pix2Text, reporting failures instead of raising.

    Parameters:
        pdf_file_path (str): The file path of the input PDF.
        output_file_path (str): Destination handed to the recognized document's
            ``to_markdown`` call.
        page_numbers (list of int, optional): Page numbers to process; forwarded
            unchanged to ``recognize_pdf``. Defaults to None (per the original
            note: process all pages).

    Returns:
        None. Any exception raised during conversion is caught and printed.
    """
    try:
        # Default-configured recognizer -> recognized document -> Markdown export.
        recognizer = Pix2Text.from_config()
        recognized = recognizer.recognize_pdf(pdf_file_path, page_numbers=page_numbers)
        recognized.to_markdown(output_file_path)

        print(f"Markdown saved to {output_file_path}")
    except Exception as error:
        # Best-effort: callers inspect the output location rather than catching errors.
        print(f"An error occurred: {error}")

def remove_images_from_pdf(self, input_path: Path, output_path: Path):
    """Save a copy of a PDF after calling ``delete_image`` on every image of every page.

    Args:
        input_path: PDF file to read.
        output_path: Location the processed PDF is saved to.

    The source document is closed even if a page operation or the save fails.
    """
    pdf_document = fitz.open(input_path)
    try:
        for page in pdf_document:
            # Each entry's first item is used as the image xref; the list is
            # walked back-to-front, as in the original implementation.
            for image in reversed(page.get_images(full=True)):
                page.delete_image(image[0])

            # Optionally clean up empty spaces
            page.clean_contents()

        pdf_document.save(output_path)
    finally:
        # Previously close() was skipped whenever anything above raised.
        pdf_document.close()

def extract_and_convert_pdf_to_md(self, pdf_path, md_path, output_folder):
    """Replace ``[MISSING_PAGE...:N]`` markers in a Markdown file with re-converted page text.

    For every page number N named by a marker, page N (1-based) of ``pdf_path``
    is written to ``<output_folder>/page_N.pdf`` and converted through
    ``self.convert_pdf_to_markdown`` into ``<output_folder>/page_N_output``.
    The generated Markdown then replaces every marker for that page, and
    ``md_path`` is rewritten in place.

    Parameters:
        pdf_path (str): Path of the PDF the Markdown was generated from.
        md_path (str): Path of the Markdown file containing the markers.
        output_folder (str): Folder that receives the per-page PDFs and outputs.

    Returns:
        None. If ``md_path`` does not exist a message is printed; if it holds no
        markers the file is left untouched. In both cases the PDF is not opened.
    """
    # Check the Markdown file first so no PDF handle is open on the early return.
    if not os.path.exists(md_path):
        print(f"Markdown file does not exist: {md_path}")
        return

    # Read the existing Markdown content
    with open(md_path, 'r', encoding='utf-8') as md_file:
        markdown_content = md_file.read()

    # Match all forms of MISSING_PAGE markers. dict.fromkeys drops repeated page
    # numbers (one substitution already replaces every marker of a page) while
    # keeping first-seen order.
    missing_pages = list(dict.fromkeys(re.findall(r'\[MISSING_PAGE.*?:(\d+)\]', markdown_content)))
    if not missing_pages:
        return

    pdf_document = fitz.open(pdf_path)
    try:
        page_count = len(pdf_document)

        for page_number in missing_pages:
            page_index = int(page_number) - 1
            if not 0 <= page_index < page_count:
                # A marker such as ":0" would otherwise become index -1.
                print(f"Page {page_number} is out of range for {pdf_path}")
                continue

            # Extract the missing page as a separate single-page PDF file
            single_page_pdf_path = os.path.join(output_folder, f"page_{page_number}.pdf")
            single_page_document = fitz.open()
            try:
                single_page_document.insert_pdf(pdf_document, from_page=page_index, to_page=page_index)
                single_page_document.save(single_page_pdf_path)
            finally:
                single_page_document.close()

            # Run the Pix2Text-based conversion on the single page PDF
            single_page_output_folder = os.path.join(output_folder, f"page_{page_number}_output")
            os.makedirs(single_page_output_folder, exist_ok=True)
            self.convert_pdf_to_markdown(single_page_pdf_path, single_page_output_folder)

            # Only consider Markdown files: the output folder may also hold other
            # artefacts, and os.listdir order is arbitrary.
            single_page_md_files = sorted(
                name
                for name in os.listdir(single_page_output_folder)
                if name.lower().endswith(('.md', '.mmd'))
                and os.path.isfile(os.path.join(single_page_output_folder, name))
            )
            if not single_page_md_files:
                print(f"No Markdown file generated for page {page_number}")
                continue

            single_page_md_path = os.path.join(single_page_output_folder, single_page_md_files[0])
            with open(single_page_md_path, 'r', encoding='utf-8') as single_page_md_file:
                single_page_md_content = single_page_md_file.read()

            # A callable replacement inserts the text verbatim, so backslashes in
            # the page content need no manual escaping.
            markdown_content = re.sub(
                rf'\[MISSING_PAGE.*?:{page_number}\]',
                lambda _match, content=single_page_md_content: content,
                markdown_content,
            )
    finally:
        pdf_document.close()

    # Save the updated Markdown content
    with open(md_path, 'w', encoding='utf-8') as md_file:
        md_file.write(markdown_content)

# Override
def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
# """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
temp_dir_path = output_path.parent

# Create the directory if it doesn't exist
if not temp_dir_path.exists():
os.makedirs(temp_dir_path)

# Define the path for the PDF without images in the output directory
pdf_without_images_path = temp_dir_path / input_path.name

# Remove images from the PDF and save to the output directory
self.remove_images_from_pdf(input_path, pdf_without_images_path)

command = [
"nougat",
str(input_path),
str(pdf_without_images_path),
# nougat requires the argument output path to be a directory, not file, so we need to handle it here
"-o",
str(output_path.parent),
"--no-skipping",
"--model",
self.model_tag,
"--batchsize",
str(self.batch_size),
# "--batchsize",
# str(self.batch_size),
]
try:
result = subprocess.run(command, check=False, capture_output=True, text=True)
Expand All @@ -54,9 +165,10 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
except Exception as e:
self._logger.error(f"An error occurred: {str(e)}")
raise

# Now change the file name of generated mmd file to align with the expected md file path from base converter
output_mmd_path = output_path.with_suffix(".mmd")
self.extract_and_convert_pdf_to_md(str(pdf_without_images_path), str(output_mmd_path), str(temp_dir_path))
# Rename it to `md` file
target = output_path.with_suffix(".md")
output_mmd_path.rename(target)
Expand All @@ -70,7 +182,7 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
try:
input_path = self._to_markdown(input_path, output_path)
input_path = self._to_markdown(input_path, output_path,)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise
Expand All @@ -81,7 +193,9 @@ def _to_page(self, input_path: Path, output_path: Path) -> Page:
with open(input_path, "r") as input_file:
text = input_file.read()


metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)

4 changes: 4 additions & 0 deletions rag/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,9 @@ tokenizers==0.15.2
torch==2.3.0
tqdm==4.66.2
transformers==4.38.2
pymupdf==1.24.5
pymupdfb==1.24.3
voyageai==0.2.2
pix2text==1.1.1
rst_to_myst==0.4.0

13 changes: 7 additions & 6 deletions rag/scraper/Scrape_pdf/README.md
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
# Scrape_pdf
First we will need to convert the pdf into a markdown format. We will use a tool called nougat.
First we will need to convert the pdf into a markdown format. We will use two tools called nougat and pix2text.
- run `pip install nougat-ocr` to install nougat
- Go to `nougat.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
```
pdf_to_md('~/Downloads/MLS.pdf', 'textbook')
- run `pip install pix2text` to install pix2text
- Go to `Scrape_pdf.py` and choose the pdf you want to convert and the name of the folder you want to save your documents at.
- Change the path in `Scrape_pdf.py` to the path of your PDF file, then run the script.

```
- After you get your markdown folder now run `header.py` to segment the contents of the markdown file into headers and contents.
```
# TODO
parser = MarkdownParser('textbook/MLS.mmd')
```
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
- After you have set up the variables you can run `python3 scrape.py` and it will start scraping the website.
Note: nougat runs faster on a computer with a GPU.
Empty file.
2 changes: 1 addition & 1 deletion rag/scraper/Scrape_pdf/nougat.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import os
def pdf_to_md(pdf_file_path, folder_name):
    """Run the ``nougat`` CLI on a PDF and write its output into ``folder_name``.

    The command is passed as an argument list rather than a shell string, so
    paths containing spaces or shell metacharacters are treated as plain paths.
    A leading ``~`` is expanded explicitly because no shell is involved.

    Parameters:
        pdf_file_path: Path of the PDF to convert.
        folder_name: Output directory given to nougat's ``-o`` option.

    Returns:
        None. The exit status of nougat is not checked; a missing executable is
        reported with a printed message.
    """
    import subprocess

    # Command to execute
    command = [
        "nougat",
        os.path.expanduser(str(pdf_file_path)),
        "-o",
        os.path.expanduser(str(folder_name)),
        "-m",
        "0.1.0-base",
        "--no-skipping",
    ]
    # Run the command
    try:
        subprocess.run(command, check=False)
    except FileNotFoundError:
        print("nougat executable not found; install it with `pip install nougat-ocr`")

Expand Down
10 changes: 8 additions & 2 deletions tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from difflib import SequenceMatcher
from pathlib import Path
from typing import List, Union
from typing import Set

from colorama import Fore, Style, init

Expand Down Expand Up @@ -133,8 +134,13 @@ def compare_folders(expected_dir: Path, output_dir: Path, similarity_threshold:
Returns:
bool: True if the folders match, False otherwise.
"""
expected_files = {file.relative_to(expected_dir) for file in expected_dir.rglob("*") if file.is_file()}
output_files = {file.relative_to(output_dir) for file in output_dir.rglob("*") if file.is_file()}

# PDF files do not need to be compared, so we ignore them for now.
def get_non_pdf_files(dir: Path) -> Set[Path]:
    """Return paths, relative to ``dir``, of every file beneath it whose suffix is not ``.pdf`` (any case)."""
    relative_paths: Set[Path] = set()
    for entry in dir.rglob("*"):
        if not entry.is_file():
            continue
        if entry.suffix.lower() == ".pdf":
            continue
        relative_paths.add(entry.relative_to(dir))
    return relative_paths

expected_files = get_non_pdf_files(expected_dir)
output_files = get_non_pdf_files(output_dir)

all_matched = True
# Compare common files
Expand Down

0 comments on commit 6a2875e

Please sign in to comment.