Skip to content

Commit

Permalink
Refactor readers
Browse files Browse the repository at this point in the history
  • Loading branch information
mawandm committed Jun 30, 2024
1 parent 9f650a7 commit ca92654
Showing 1 changed file with 65 additions and 87 deletions.
152 changes: 65 additions & 87 deletions nesis/rag/core/components/ingest/readers.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from fsspec import AbstractFileSystem
from llama_index.core import Document
from llama_index.core.readers.base import BaseReader
from unstructured.documents.elements import Element
from unstructured.partition.image import partition_image
from unstructured.partition.pdf import partition_pdf
from unstructured.partition.xlsx import partition_xlsx
Expand Down Expand Up @@ -36,43 +37,83 @@ def _clean_metadata(metadata: Dict, exclusion_list: List[str] = None) -> Dict:
return metadata_copy


class BaseFileReader(BaseReader):
    """
    Shared base for readers that partition a file into unstructured elements.

    Subclasses produce the elements (e.g. with ``partition_pdf``) in their
    ``load_data`` and hand them to :meth:`load_documents`, which turns every
    element with text into a ``Document``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        """
        Build the list of metadata keys to drop from every element.

        :param config: optional reader settings; ``metadata_exclusion_list``
            replaces the default ``["file_directory", "filename"]`` keys.
        """
        self._config = config or {}
        # The configured (or default) keys are always extended with the fixed
        # keys below, so those are excluded regardless of configuration.
        self._metadata_exclusion_list: List[str] = (
            self._config.get("metadata_exclusion_list")
            or [
                "file_directory",
                "filename",
            ]
        ) + [
            "text",
            "file_name",
            "coordinates",
            "embedding",
            "metadata_template",
            "metadata_seperator",
            "text_template",
            "excluded_embed_metadata_keys",
            "excluded_llm_metadata_keys",
            "relationships",
            "start_char_idx",
            "end_char_idx",
        ]

    def load_documents(
        self,
        elements: List[Element],
        extra_info: Optional[Dict] = None,
    ) -> List[Document]:
        """
        Convert partitioned elements into documents.

        :param elements: elements produced by an unstructured partition call.
        :param extra_info: optional metadata added to every document; element
            metadata takes precedence when both define the same key.
        :return: one ``Document`` per element that carries text.
        """
        base_metadata = extra_info or {}
        documents: List[Document] = []

        for element in elements:
            element_dict = element.to_dict()
            element_text = element_dict["text"]
            # Elements without text would only yield empty documents.
            if not element_text:
                continue
            # Flatten the element: top-level fields first, then its nested
            # "metadata" dict, and pass the result through _clean_metadata
            # together with the exclusion list.
            metadata = _clean_metadata(
                {
                    **{
                        key: val
                        for key, val in element_dict.items()
                        if key not in ("text", "metadata")
                    },
                    **element_dict["metadata"],
                },
                exclusion_list=self._metadata_exclusion_list,
            )
            documents.append(
                Document(
                    text=element_text,
                    metadata={**base_metadata, **metadata},
                )
            )
        return documents


class ExcelReader(BaseFileReader):
    """
    Reader for Excel workbooks, partitioned with ``partition_xlsx``.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        super().__init__(config=config)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Partition the workbook and convert its elements into documents.

        :param file: path of the workbook; it is made absolute before
            partitioning.
        :param extra_info: optional metadata forwarded to ``load_documents``.
        :param fs: accepted but not used by this reader.
        :return: the documents built by ``load_documents``.
        """
        workbook_path = file.absolute()
        return self.load_documents(
            elements=partition_xlsx(workbook_path),
            extra_info=extra_info,
        )


class OdsReader(BaseReader):
"""
A simple open document spreadsheet reader
Expand All @@ -88,41 +129,24 @@ def load_data(
return [Document(text=data, metadata=extra_info or {})]


class ImageReader(BaseFileReader):
    """
    Reader for image files.

    The llamaindex reader doesn't return any text, so we use unstructured.io
    instead.
    """

    def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
        super().__init__(config=config)

    def load_data(
        self,
        file: Path,
        extra_info: Optional[Dict] = None,
        fs: Optional[AbstractFileSystem] = None,
    ) -> List[Document]:
        """
        Partition the image and convert its elements into documents.

        :param file: path of the image; it is made absolute before
            partitioning.
        :param extra_info: optional metadata forwarded to ``load_documents``.
        :param fs: accepted but not used by this reader.
        :return: the documents built by ``load_documents``.
        """
        elements = partition_image(
            file.absolute(), strategy="hi_res", infer_table_structure=True
        )
        return self.load_documents(elements=elements, extra_info=extra_info)


class TiffReader(BaseReader):
Expand Down Expand Up @@ -170,33 +194,13 @@ def load_data(
return documents


class PdfReader(BaseReader):
class PdfReader(BaseFileReader):
"""
A simple PDF file reader.
"""

def __init__(self, config: Optional[Dict[str, Any]] = None) -> None:
self._config = config or {}
self._metadata_exclusion_list: list[str] = (
self._config.get("metadata_exclusion_list")
or [
"file_directory",
"filename",
]
) + [
"text",
"file_name",
"coordinates",
"embedding",
"metadata_template",
"metadata_seperator",
"text_template",
"excluded_embed_metadata_keys",
"excluded_llm_metadata_keys",
"relationships",
"start_char_idx",
"end_char_idx",
]
BaseFileReader.__init__(self, config=config)

def load_data(
self,
Expand All @@ -207,30 +211,4 @@ def load_data(
elements = partition_pdf(
file.absolute(), strategy="hi_res", infer_table_structure=True
)
documents: List[Document] = []

for element in elements:
element_dict = element.to_dict()
if element_dict["text"] == "":
continue
element_text = element_dict["text"]
metadata = _clean_metadata(
{
**{
key: val
for key, val in element_dict.items()
if key not in ["text", "metadata"]
},
**element_dict["metadata"],
},
exclusion_list=self._metadata_exclusion_list,
)
document = Document(
text=element_text,
metadata={
**(extra_info or {}),
**metadata,
},
)
documents.append(document)
return documents
return self.load_documents(elements=elements, extra_info=extra_info)

0 comments on commit ca92654

Please sign in to comment.