Fix converter bug and pass tests
perryzjc authored and FranardoHuang committed Jul 14, 2024
1 parent 10284ea commit 5090f4c
Showing 8 changed files with 95 additions and 53 deletions.
12 changes: 10 additions & 2 deletions rag/file_conversion_router/classes/page.py
@@ -2,6 +2,8 @@
from rag.file_conversion_router.classes.chunk import Chunk
import tiktoken
import pickle


class Page:
def __init__(self, pagename: str, content: dict, filetype: str, page_url: str = ""):
"""
@@ -31,6 +33,7 @@ def recursive_separate(self, response: str, token_limit: int = 400) -> list:
Returns:
list: List of separated text chunks.
"""

def token_size(sentence: str) -> int:
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
return len(encoding.encode(sentence))
@@ -112,8 +115,13 @@ def count_consecutive_hashes(s):
headers_content.append((curheader, current_content))

return headers_content

def page_seperate_to_segments(self) -> None:
self.segments = [i for i in self.extract_headers_and_content(self.content['text'])]
if not self.segments:
# Level 0 when no header is found
self.segments = [(("NO ANY HEADER DETECTED", 0),
self.content['text'])]

def print_header_tree(self):
result = ""
@@ -215,13 +223,14 @@ def generate_hyperlink_header(header_text):
hyperlink_header = lower_text.replace(' ', '-')

return hyperlink_header

# separate with recursive_separate
for i in self.tree_segments:
content_chunks = self.recursive_separate(i['Segment_print'], 400)
for count, content_chunk in enumerate(content_chunks):
headers = i['Page_path']
urls = [f"{self.page_url}#{generate_hyperlink_header(header)}" for header in headers]
page_path = ' > '.join(f"{item} (h{i+1})" for i, item in enumerate(i['Page_path'])) + f" ({count})"
page_path = ' > '.join(f"{item} (h{i + 1})" for i, item in enumerate(i['Page_path'])) + f" ({count})"
self.chunks.append(Chunk(page_path, content_chunk, urls))
return self.chunks

@@ -255,4 +264,3 @@ def chunks_to_pkl(self, output_path: str) -> None:
"""
with open(output_path, "wb") as f:
pickle.dump(self.chunks, f)
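
For orientation, a minimal usage sketch of the Page pipeline touched above, mirroring the BaseConverter._perform_conversion flow later in this commit (the file name, text, and URL are illustrative placeholders):

from rag.file_conversion_router.classes.page import Page

md_text = "# Intro\nSome notes under the intro header."  # placeholder content
page = Page(pagename="example", content={"text": md_text}, filetype="md",
            page_url="https://example.com/docs/example")  # hypothetical URL
page.to_chunk()                    # build header-aware, token-limited chunks
page.chunks_to_pkl("example.pkl")  # pickle the resulting chunk list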

45 changes: 34 additions & 11 deletions rag/file_conversion_router/conversion/base_converter.py
@@ -7,12 +7,12 @@
from shutil import copy2
from threading import Lock
from typing import Dict, List, Union
import yaml

from rag.file_conversion_router.utils.logger import conversion_logger, logger
from rag.file_conversion_router.utils.markdown_parser import MarkdownParser
from rag.file_conversion_router.utils.utils import calculate_hash, ensure_path
from rag.file_conversion_router.classes.page import Page
from rag.file_conversion_router.classes.vidpage import VidPage


class BaseConverter(ABC):
"""Base classes for all file type converters.
@@ -33,7 +33,6 @@ def __init__(self):
self._md_parser = None

self._md_path = None
self._tree_txt_path = None
self._pkl_path = None

self._logger = logger
@@ -105,35 +104,59 @@ def _setup_output_paths(self, input_path: Union[str, Path], output_folder: Union
# TODO: current MarkdownParser does not support custom output paths,
# below paths are only used for caching purposes at the moment,
# since the markdown parser generates below file paths by default
self._tree_txt_path = ensure_path(output_folder / f"{input_path.stem}.md.tree.txt")
self._pkl_path = ensure_path(output_folder / f"{input_path.stem}.md.pkl")
self._pkl_path = ensure_path(output_folder / f"{input_path.stem}.pkl")

def _convert_and_cache(self, input_path: Path, output_folder: Path, file_hash: str) -> List[Path]:
self._setup_output_paths(input_path, output_folder)
# This method embeds the abstract method `_to_markdown`, which needs to be implemented by the child classes.
_, conversion_time = self._perform_conversion(input_path, output_folder)
paths = [self._md_path, self._tree_txt_path, self._pkl_path]
paths = [self._md_path, self._pkl_path]
ConversionCache.set_cached_paths_and_time(file_hash, paths, conversion_time)

def _use_cached_files(self, cached_paths: List[Path], output_folder: Path) -> None:
"""Use cached files and copy them to the specified output folder."""
output_folder = ensure_path(output_folder)
output_folder.mkdir(parents=True, exist_ok=True)

md_path, tree_txt_path, pkl_path = cached_paths
md_path, pkl_path = cached_paths
correct_file_name = self._md_path.stem
for path, suffix in zip((md_path, tree_txt_path, pkl_path), (".md", ".md.tree.txt", ".md.pkl")):
for path, suffix in zip((md_path, pkl_path), (".md", ".pkl")):
des_path = Path(copy2(path, output_folder))
des_path.rename(output_folder / f"{correct_file_name}{suffix}")
self._logger.info(f"Copied cached file from {path} to {des_path}.")

def _read_metadata(self, metadata_path: Path) -> dict:
"""Read metadata from file or return mocked data if file doesn't exist."""
if metadata_path.exists():
try:
with open(metadata_path, "r") as metadata_file:
return yaml.safe_load(metadata_file)
except Exception as e:
self._logger.error(f"Error reading metadata file: {str(e)}")
return self._get_mocked_metadata()
else:
self._logger.warning(f"Metadata file not found: {metadata_path}. Using mocked metadata.")
return self._get_mocked_metadata()

@staticmethod
def _get_mocked_metadata() -> dict:
"""Return mocked metadata when the actual metadata file is missing."""
return {
"URL": "URL_NOT_AVAILABLE",
# Add other mocked metadata fields as needed
}

@conversion_logger
def _perform_conversion(self, input_path: Path, output_folder: Path) -> None:
"""Perform the file conversion process."""
page = self._convert_to_page(input_path, output_folder)[0]
if not output_folder.exists():
output_folder.mkdir(parents=True, exist_ok=True)
logger.warning(f"Output folder did not exist, it's now created: {output_folder}")
filename = output_folder.stem
pkl_output_path = output_folder / f"{filename}.pkl"
page = self._convert_to_page(input_path, pkl_output_path)[0]
page.to_chunk()
pkl_file = output_folder.with_suffix(".pkl")
page.chunks_to_pkl(str(pkl_file))
page.chunks_to_pkl(str(pkl_output_path))

@abstractmethod
def _to_page(self, input_path: Path, output_path: Path) -> Page:
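
As a rough sketch of the metadata fallback introduced by _read_metadata and _get_mocked_metadata, here is the same logic in standalone form (logger calls replaced with print; the single URL key mirrors the mocked dictionary above):

import yaml
from pathlib import Path

MOCKED_METADATA = {"URL": "URL_NOT_AVAILABLE"}

def read_metadata(metadata_path: Path) -> dict:
    """Parse the sidecar YAML if present and readable; otherwise fall back to mocked metadata."""
    if not metadata_path.exists():
        print(f"Metadata file not found: {metadata_path}. Using mocked metadata.")
        return MOCKED_METADATA
    try:
        with open(metadata_path, "r") as f:
            return yaml.safe_load(f)
    except Exception as e:
        print(f"Error reading metadata file: {e}")
        return MOCKED_METADATA

# A missing sidecar file no longer aborts the conversion:
print(read_metadata(Path("missing_metadata.yaml"))["URL"])  # -> URL_NOT_AVAILABLE
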
22 changes: 13 additions & 9 deletions rag/file_conversion_router/conversion/md_converter.py
@@ -26,16 +26,20 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
return output_path
def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
try:
input_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)
parent = input_path.parent
stem = input_path.stem
filetype = input_path.suffix.split(".")[1]

filetype = input_path.suffix.lstrip('.')
with open(input_path, "r") as input_file:
text = input_file.read()
metadata = parent / (stem+"_metadata.yaml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content["URL"]
page = Page(pagename=stem, content={'text': text}, filetype=filetype, page_url=url)
return page

metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
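
To make the lookup above concrete, the sidecar-path derivation and suffix handling in isolation (file names are hypothetical):

from pathlib import Path

input_path = Path("notes/lab01.md")  # hypothetical input
metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
print(metadata_path)                  # notes/lab01_metadata.yaml
print(input_path.suffix.lstrip('.'))  # "md"; the previous suffix.split(".")[1] raises IndexError for suffix-less paths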

63 changes: 34 additions & 29 deletions rag/file_conversion_router/conversion/pdf_converter.py
@@ -33,25 +33,29 @@ def _validate_parameters(self):
# Override
def _to_markdown(self, input_path: Path, output_path: Path) -> Path:
# """Perform PDF to Markdown conversion using Nougat with the detected hardware configuration."""
# command = [
# "nougat",
# str(input_path),
# # nougat requires the argument output path to be a directory, not file, so we need to handle it here
# "-o",
# str(output_path.parent),
# "--no-skipping",
# "--model",
# self.model_tag,
# "--batchsize",
# str(self.batch_size),
# ]
# try:
# result = subprocess.run(command, check=False, capture_output=True, text=True)
# self._logger.info(f"Output: {result.stdout}")
# self._logger.info(f"Errors: {result.stderr}")
# if result.returncode != 0:
# self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
# # Now change the file name of generated mmd file to align with the expected md file path from base converter
command = [
"nougat",
str(input_path),
# nougat requires the output path argument to be a directory, not a file, so we need to handle it here
"-o",
str(output_path.parent),
"--no-skipping",
"--model",
self.model_tag,
"--batchsize",
str(self.batch_size),
]
try:
result = subprocess.run(command, check=False, capture_output=True, text=True)
self._logger.info(f"Output: {result.stdout}")
self._logger.info(f"Errors: {result.stderr}")
if result.returncode != 0:
self._logger.error(f"Command exited with a non-zero status: {result.returncode}")
except Exception as e:
self._logger.error(f"An error occurred: {str(e)}")
raise

# Now rename the generated .mmd file to align with the expected .md file path from the base converter
output_mmd_path = output_path.with_suffix(".mmd")
# Rename it to `md` file
target = output_path.with_suffix(".md")
@@ -65,18 +69,19 @@ def _to_markdown(self, input_path: Path, output_path: Path) -> Path:

def _to_page(self, input_path: Path, output_path: Path) -> Page:
"""Perform Markdown to Page conversion."""
parent = input_path.parent
stem = input_path.stem
input_path = self._to_markdown(input_path, output_path)
print(input_path)
try:
input_path = self._to_markdown(input_path, output_path)
except Exception as e:
self._logger.error(f"An error occurred during markdown conversion: {str(e)}")
raise

output_path.parent.mkdir(parents=True, exist_ok=True)

filetype = input_path.suffix.split(".")[1]
filetype = input_path.suffix.lstrip('.')
with open(input_path, "r") as input_file:
text = input_file.read()
metadata = parent / (stem+"_metadata.yaml")
with open(metadata, "r") as metadata_file:
metadata_content = yaml.safe_load(metadata_file)
url = metadata_content.get("URL", None)
return Page(content={'text': text}, filetype=filetype, page_url=url)

metadata_path = input_path.with_name(f"{input_path.stem}_metadata.yaml")
metadata_content = self._read_metadata(metadata_path)
url = metadata_content.get("URL")
return Page(pagename=input_path.stem, content={'text': text}, filetype=filetype, page_url=url)
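
For context, a sketch of the .mmd-to-.md rename performed after the Nougat call above (Nougat writes <stem>.mmd into the output directory; paths here are illustrative):

from pathlib import Path

output_path = Path("out/paper/paper.md")           # markdown path expected by the base converter
output_mmd_path = output_path.with_suffix(".mmd")  # out/paper/paper.mmd, as produced by nougat
if output_mmd_path.exists():
    output_mmd_path.rename(output_path.with_suffix(".md"))  # align with the expected .md path
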
1 change: 1 addition & 0 deletions rag/file_conversion_router/conversion/rst_converter.py
@@ -5,6 +5,7 @@
from rst_to_myst import rst_to_myst
import yaml


class RstConverter(BaseConverter):
def __init__(self):
super().__init__()
1 change: 1 addition & 0 deletions rag/requirements.txt
@@ -24,3 +24,4 @@ torch==2.3.0
tqdm==4.66.2
transformers==4.38.2
voyageai==0.2.2
rst_to_myst==0.4.0
2 changes: 1 addition & 1 deletion tests/test_rag/conftest.py
@@ -109,7 +109,7 @@ def helper_unit_test_on_converter(input_path: str, expected_output_paths: List[s
output_folder = tmp_path / input_path.stem
converter.convert(input_path, output_folder)

for idx, suffix in enumerate([".md", ".md.pkl", ".md.tree.txt"]):
for idx, suffix in enumerate([".md", ".pkl"]):
output_file_path = output_folder / f"{input_path.stem}{suffix}"
assert output_file_path.exists(), f"File {output_file_path} does not exist."
assert compare_files(
@@ -27,7 +27,7 @@ def test_pdf_converter_on_task_manager(input_path: str, expected_output_paths: L
pytest.fail(f"Conversion failed with an exception: {e}")

# Now that the task is complete, check the file against the expected output
for idx, suffix in enumerate([".md", ".md.pkl", ".md.tree.txt"]):
for idx, suffix in enumerate([".md", ".pkl"]):
output_file_path = output_folder / f"{input_path.stem}{suffix}"
assert output_file_path.exists(), f"File {output_file_path} does not exist."
assert compare_files(
