Fix converter bug and pass tests
perryzjc authored and FranardoHuang committed Jul 14, 2024
1 parent 10284ea commit 5090f4c
Showing 8 changed files with 95 additions and 53 deletions.
12 changes: 10 additions & 2 deletions rag/file_conversion_router/classes/page.py
@@ -2,6 +2,8 @@
from rag.file_conversion_router.classes.chunk import Chunk
import tiktoken
import pickle


class Page:
def __init__(self, pagename: str, content: dict, filetype: str, page_url: str = ""):
"""
@@ -31,6 +33,7 @@ def recursive_separate(self, response: str, token_limit: int = 400) -> list:
Returns:
list: List of separated text chunks.
"""

def token_size(sentence: str) -> int:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
return len(encoding.encode(sentence))
@@ -112,8 +115,13 @@ def count_consecutive_hashes(s):
headers_content.append((curheader, current_content))

return headers_content

def page_seperate_to_segments(self) -> None:
self.segments = [i for i in self.extract_headers_and_content(self.content['text'])]
if not self.segments:
# Level 0 when no header is found
self.segments = [(("NO ANY HEADER DETECTED", 0),
self.content['text'])]

def print_header_tree(self):
result = ""
@@ -215,13 +223,14 @@ def generate_hyperlink_header(header_text):
hyperlink_header = lower_text.replace(' ', '-')

return hyperlink_header

# separate with recursive_separate
for i in self.tree_segments:
content_chunks = self.recursive_separate(i['Segment_print'], 400)
for count, content_chunk in enumerate(content_chunks):
headers = i['Page_path']
urls = [f"{self.page_url}#{generate_hyperlink_header(header)}" for header in headers]
page_path = ' > '.join(f"{item} (h{i+1})" for i, item in enumerate(i['Page_path'])) + f" ({count})"
page_path = ' > '.join(f"{item} (h{i + 1})" for i, item in enumerate(i['Page_path'])) + f" ({count})"
self.chunks.append(Chunk(page_path, content_chunk, urls))
return self.chunks

@@ -255,4 +264,3 @@ def chunks_to_pkl(self, output_path: str) -> None:
"""
with open(output_path, "wb") as f:
pickle.dump(self.chunks, f)
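
For orientation, a minimal usage sketch of the Page pipeline touched above, mirroring the BaseConverter._perform_conversion flow later in this commit (the file name, text, and URL are illustrative placeholders):

from rag.file_conversion_router.classes.page import Page

md_text = "# Intro\nSome notes under the intro header."  # placeholder content
page = Page(pagename="example", content={"text": md_text}, filetype="md",
            page_url="https://example.com/docs/example")  # hypothetical URL
page.to_chunk()                    # build header-aware, token-limited chunks
page.chunks_to_pkl("example.pkl")  # pickle the resulting chunk list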

45 changes: 34 additions & 11 deletions rag/file_conversion_router/conversion/base_converter.py
@@ -7,12 +7,12 @@
from shutil import copy2
from threading import Lock
from typing import Dict, List, Union
import yaml

from rag.file_conversion_router.utils.logger import conversion_logger, logger
from rag.file_conversion_router.utils.markdown_parser import MarkdownParser
from rag.file_conversion_router.utils.utils import calculate_hash, ensure_path
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.vidpage import VidPage


class BaseConverter(ABC):
"""Base classes for all file type converters.
@@ -33,7 +33,6 @@ def __init__(self):
self._md_parser = None

self._md_path = None
self._tree_txt_path = None
self._pkl_path = None

self._logger = logger
@@ -105,35 +104,59 @@ def _setup_output_paths(self, input_path: Union[str, Path], output_folder: Union
# TODO: current MarkdownParser does not support custom output paths,
# below paths are only used for caching purposes at the moment,
# since the markdown parser generates below file paths by default
self._tree_txt_path = ensure_path(output_folder / f"{input_path.stem}.md.tree.txt")
self._pkl_path = ensure_path(output_folder / f"{input_path.stem}.md.pkl")
self._pkl_path = ensure_path(output_folder / f"{input_path.stem}.pkl")

def _convert_and_cache(self, input_path: Path, output_folder: Path, file_hash: str) -> List[Path]:
self._setup_output_paths(input_path, output_folder)
# This method embeds the abstract method `_to_markdown`, which needs to be implemented by the child classes.
_, conversion_time = self._perform_conversion(input_path, output_folder)
paths = [self._md_path, self._tree_txt_path, self._pkl_path]
paths = [self._md_path, self._pkl_path]
ConversionCache.set_cached_paths_and_time(file_hash, paths, conversion_time)

def _use_cached_files(self, cached_paths: List[Path], output_folder: Path) -> None:
"""Use cached files and copy them to the specified output folder."""
output_folder = ensure_path(output_folder)
output_folder.mkdir(parents=True, exist_ok=True)

md_path, tree_txt_path, pkl_path = cached_paths
md_path, pkl_path = cached_paths
correct_file_name = self._md_path.stem
for path, suffix in zip((md_path, tree_txt_path, pkl_path), (".md", ".md.tree.txt", ".md.pkl")):
for path, suffix in zip((md_path, pkl_path), (".md", ".pkl")):
des_path = Path(copy2(path, output_folder))
des_path.rename(output_folder / f"{correct_file_name}{suffix}")
self._logger.info(f"Copied cached file from {path} to {des_path}.")

def _read_metadata(self, metadata_path: Path) -> dict:
"""Read metadata from file or return mocked data if file doesn't exist."""
if metadata_path.exists():
try:
with open(metadata_path, "r") as metadata_file:
return yaml.safe_load(metadata_file)
except Exception as e:
self._logger.error(f"Error reading metadata file: {str(e)}")
return self._get_mocked_metadata()
else:
self._logger.warning(f"Metadata file not found: {metadata_path}. Using mocked metadata.")
return self._get_mocked_metadata()

@staticmethod
def _get_mocked_metadata() -> dict:
"""Return mocked metadata when the actual metadata file is missing."""
return {
"URL": "URL_NOT_AVAILABLE",
# Add other mocked metadata fields as needed
}

@conversion_logger
def _perform_conversion(self, input_path: Path, output_folder: Path) -> None:
"""Perform the file conversion process."""
page = self._convert_to_page(input_path, output_folder)[0]
if not output_folder.exists():
output_folder.mkdir(parents=True, exist_ok=True)
logger.warning(f"Output folder did not exist, it's now created: {output_folder}")
filename = output_folder.stem
pkl_output_path = output_folder / f"{filename}.pkl"
page = self._convert_to_page(input_path, pkl_output_path)[0]
page.to_chunk()
pkl_file = output_folder.with_suffix(".pkl")
page.chunks_to_pkl(str(pkl_file))
page.chunks_to_pkl(str(pkl_output_path))

@abstractmethod
def _to_page(self, input_path: Path, output_path: Path) -> Page:
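
As a rough sketch of the metadata fallback introduced by _read_metadata and _get_mocked_metadata, here is the same logic in standalone form (logger calls replaced with print; the single URL key mirrors the mocked dictionary above):

import yaml
from pathlib import Path

MOCKED_METADATA = {"URL": "URL_NOT_AVAILABLE"}

def read_metadata(metadata_path: Path) -> dict:
    """Parse the sidecar YAML if present and readable; otherwise fall back to mocked metadata."""
    if not metadata_path.exists():
        print(f"Metadata file not found: {metadata_path}. Using mocked metadata.")
        return MOCKED_METADATA
    try:
        with open(metadata_path, "r") as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading metadata file: {e}")
        return MOCKED_METADATA

# A missing sidecar file no longer aborts the conversion:
print(read_metadata(Path("missing_metadata.yaml"))["URL"])  # -> URL_NOT_AVAILABLE
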
22 changes: 13 additions & 9 deletions rag/file_conversion_router/conversion/md_converter.py
@@ -26,16 +26,20 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
return output_path
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
try:
input_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)
parent = input_path.parent
stem = input_path.stem
filetype = input_path.suffix.split(".")[1]

filetype = input_path.suffix.lstrip('.')
with open(input_path, "r") as input_file:
text = input_file.read()
metadata = parent / (stem+"_metadata.yaml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content["URL"]
page = Page(pagename=stem, content={'text': text}, filetype=filetype, page_url=url)
return page

metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
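
To make the lookup above concrete, the sidecar-path derivation and suffix handling in isolation (file names are hypothetical):

from pathlib import Path

input_path = Path("notes/lab01.md")  # hypothetical input
metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
print(metadata_path)                  # notes/lab01_metadata.yaml
print(input_path.suffix.lstrip('.'))  # "md"; the previous suffix.split(".")[1] raises IndexError for suffix-less paths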

63 changes: 34 additions & 29 deletions rag/file_conversion_router/conversion/pdf_converter.py
@@ -33,25 +33,29 @@ def _validate_parameters(self):
# Override
def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
# """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
# command = [
# "nougat",
# str(input_path),
# # nougat requires the argument output path to be a directory, not file, so we need to handle it here
# "-o",
# str(output_path.parent),
# "--no-skipping",
# "--model",
# self.model_tag,
# "--batchsize",
# str(self.batch_size),
# ]
# try:
# result = subprocess.run(command, check=False, capture_output=True, text=True)
# self._logger.info(f"Output: {result.stdout}")
# self._logger.info(f"Errors: {result.stderr}")
# if result.returncode != 0:
# self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
# # Now change the file name of generated mmd file to align with the expected md file path from base converter
command = [
"nougat",
str(input_path),
# nougat requires the output path argument to be a directory, not a file, so we need to handle it here
"-o",
str(output_path.parent),
"--no-skipping",
"--model",
self.model_tag,
"--batchsize",
str(self.batch_size),
]
try:
result = subprocess.run(command, check=False, capture_output=True, text=True)
self._logger.info(f"Output: {result.stdout}")
self._logger.info(f"Errors: {result.stderr}")
if result.returncode != 0:
self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
except Exception as e:
self._logger.error(f"An error occurred: {str(e)}")
raise

# Now rename the generated .mmd file to align with the expected .md file path from the base converter
output_mmd_path = output_path.with_suffix(".mmd")
# Rename it to `md` file
target = output_path.with_suffix(".md")
@@ -65,18 +69,19 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:

def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
parent = input_path.parent
stem = input_path.stem
input_path = self._to_markdown(input_path, output_path)
print(input_path)
try:
input_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)

filetype = input_path.suffix.split(".")[1]
filetype = input_path.suffix.lstrip('.')
with open(input_path, "r") as input_file:
text = input_file.read()
metadata = parent / (stem+"_metadata.yaml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content.get("URL", None)
return Page(content={'text': text}, filetype=filetype, page_url=url)

metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
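
For context, a sketch of the .mmd-to-.md rename performed after the Nougat call above (Nougat writes <stem>.mmd into the output directory; paths here are illustrative):

from pathlib import Path

output_path = Path("out/paper/paper.md")           # markdown path expected by the base converter
output_mmd_path = output_path.with_suffix(".mmd")  # out/paper/paper.mmd, as produced by nougat
if output_mmd_path.exists():
    output_mmd_path.rename(output_path.with_suffix(".md"))  # align with the expected .md path
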
1 change: 1 addition & 0 deletions rag/file_conversion_router/conversion/rst_converter.py
@@ -5,6 +5,7 @@
from rst_to_myst import rst_to_myst
import yaml


class RstConverter(BaseConverter):
def __init__(self):
super().__init__()
1 change: 1 addition & 0 deletions rag/requirements.txt
@@ -24,3 +24,4 @@ torch==2.3.0
tqdm==4.66.2
transformers==4.38.2
voyageai==0.2.2
rst_to_myst==0.4.0
2 changes: 1 addition & 1 deletion tests/test_rag/conftest.py
@@ -109,7 +109,7 @@ def helper_unit_test_on_converter(input_path: str, expected_output_paths: List[s
output_folder = tmp_path / input_path.stem
converter.convert(input_path, output_folder)

for idx, suffix in enumerate([".md", ".md.pkl", ".md.tree.txt"]):
for idx, suffix in enumerate([".md", ".pkl"]):
output_file_path = output_folder / f"{input_path.stem}{suffix}"
assert output_file_path.exists(), f"File {output_file_path} does not exist."
assert compare_files(
@@ -27,7 +27,7 @@ def test_pdf_converter_on_task_manager(input_path: str, expected_output_paths: L
pytest.fail(f"Conversion failed with an exception: {e}")

# Now that the task is complete, check the file against the expected output
for idx, suffix in enumerate([".md", ".md.pkl", ".md.tree.txt"]):
for idx, suffix in enumerate([".md", ".pkl"]):
output_file_path = output_folder / f"{input_path.stem}{suffix}"
assert output_file_path.exists(), f"File {output_file_path} does not exist."
assert compare_files(
