-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Wrote text ingestors, still incomplete tests
- Loading branch information
Showing
5 changed files
with
106 additions
and
11 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
from enum import Enum | ||
|
||
|
||
class IngestorError(Enum):
    """Closed set of error categories, each paired with a display string.

    Every member's value is a human-readable label; look a member up by
    label with ``IngestorError("Timeout")`` or by name with
    ``IngestorError["TIMEOUT"]``.
    """

    # The input stream has no more data.
    EOF = "End of File"

    # Failure reported under the "ETCD Error" label.
    ETCD = "ETCD Error"

    # Failure reported under the "Network Error" label.
    NETWORK = "Network Error"

    # An operation did not finish in time.
    TIMEOUT = "Timeout"

    # Catch-all for anything not covered by the members above.
    UNKNOWN = "Unknown Error"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,26 +1,29 @@ | ||
|
||
from typing import Optional | ||
from querent.config.ingestor_config import IngestorBackend | ||
from querent.ingestors.base_ingestor import BaseIngestor | ||
from querent.ingestors.ingestor_factory import IngestorFactory, UnsupportedIngestor | ||
from querent.ingestors.pdfs.pdf_ingestor_v1 import PdfIngestorFactory | ||
from querent.ingestors.texts.text_ingestor import TextIngestorFactory | ||
|
||
|
||
class IngestorFactoryManager:
    """Registry that resolves a file extension to an ingestor factory.

    Factories are stored in ``ingestor_factories`` keyed by the ``value``
    of the matching ``IngestorBackend`` member. Lookups lower-case the
    extension first, so they are case-insensitive as long as the stored
    keys are lower-case (presumably they are; verify against
    ``IngestorBackend``).
    """

    def __init__(self):
        self.ingestor_factories = {
            IngestorBackend.PDF.value: PdfIngestorFactory(),
            IngestorBackend.TEXT.value: TextIngestorFactory(),
            # Add more mappings as needed
        }

    async def get_factory(self, file_extension: str) -> IngestorFactory:
        """Return the factory registered for ``file_extension``.

        An ``UnsupportedIngestor`` is returned when nothing is registered
        for the extension, so callers always receive a factory object.
        """
        factory = self.ingestor_factories.get(file_extension.lower())
        if factory is None:
            return UnsupportedIngestor("Unsupported file extension")
        return factory

    async def get_ingestor(
        self, file_extension: str, processors: Optional[list] = None
    ) -> Optional[BaseIngestor]:
        """Build an ingestor for ``file_extension``.

        Args:
            file_extension: Extension used to pick the factory.
            processors: Optional processors handed to the factory's
                ``create``. When omitted, ``create`` is called with the
                extension only, exactly as before this parameter existed.

        Returns:
            Whatever the selected factory's ``create`` returns.
        """
        # get_factory is a coroutine function: without ``await`` the name
        # below would be bound to a coroutine object, not to a factory.
        factory = await self.get_factory(file_extension)
        if processors is None:
            return await factory.create(file_extension)
        return await factory.create(file_extension, processors)

    async def supports(self, file_extension: str) -> bool:
        """Report whether the resolved factory supports ``file_extension``."""
        factory = await self.get_factory(file_extension)
        # The factory's ``supports`` is itself a coroutine; awaiting it
        # yields the real bool instead of an always-truthy coroutine.
        return await factory.supports(file_extension)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import logging
from typing import AsyncGenerator, List, Optional

from querent.common.types.collected_bytes import CollectedBytes
from querent.config.ingestor_config import IngestorBackend
from querent.ingestors.base_ingestor import BaseIngestor
from querent.ingestors.ingestor_factory import IngestorFactory
from querent.processors.async_processor import AsyncProcessor
|
||
|
||
class TextIngestorFactory(IngestorFactory):
    """Factory that builds ``TextIngestor`` instances for plain-text files."""

    # Extensions are stored lower-case and without a leading dot.
    SUPPORTED_EXTENSIONS = {"txt"}

    async def supports(self, file_extension: str) -> bool:
        """Return True when ``file_extension`` (any case) is supported."""
        return file_extension.lower() in self.SUPPORTED_EXTENSIONS

    async def create(
        self,
        file_extension: str,
        processors: Optional[List[AsyncProcessor]] = None,
    ) -> Optional[BaseIngestor]:
        """Create a ``TextIngestor`` for a supported extension.

        Args:
            file_extension: Extension of the file to be ingested.
            processors: Processors the ingestor applies to the extracted
                text, in order. Defaults to no processors.

        Returns:
            A new ``TextIngestor``, or ``None`` when the extension is not
            supported.
        """
        # ``supports`` is a coroutine function; an un-awaited coroutine
        # object is always truthy, so the check must be awaited.
        if not await self.supports(file_extension):
            return None

        return TextIngestor([] if processors is None else processors)
|
||
|
||
class TextIngestor(BaseIngestor):
    """Ingestor that reassembles chunked text files and processes them.

    Chunks for one file are concatenated, decoded as UTF-8 and passed
    through ``processors`` in order.
    """

    def __init__(self, processors: List[AsyncProcessor]):
        self.processors = processors
        # ``super`` must be called to obtain the bound proxy; the bare
        # ``super.__init__`` refers to the builtin type's own initializer.
        super().__init__(IngestorBackend.TEXT)

    async def ingest(
        self, poll_function: AsyncGenerator[CollectedBytes, None]
    ) -> AsyncGenerator[List[str], None]:
        """Group incoming chunks by file and yield each file's result.

        Chunks reporting ``is_error()`` are skipped. A change in
        ``chunk.file`` marks the end of the previous file, which is then
        decoded, processed and yielded. The last file is flushed when the
        stream ends. Chunks whose ``file`` is falsy are never flushed.

        Any exception raised while iterating or processing is logged and
        an empty list is yielded, after which the generator finishes.
        """
        try:
            # Collect pieces in a list and join once per file; repeated
            # ``bytes +=`` would copy the buffer on every chunk.
            pending: List[bytes] = []
            current_file = None

            async for chunk_bytes in poll_function:
                if chunk_bytes.is_error():
                    continue

                if chunk_bytes.file != current_file:
                    if current_file:
                        yield await self.extract_and_process_text(
                            CollectedBytes(
                                file=current_file, data=b"".join(pending)
                            )
                        )

                    pending = []
                    current_file = chunk_bytes.file

                pending.append(chunk_bytes.data)

            # Flush whatever was accumulated for the final file.
            if current_file:
                yield await self.extract_and_process_text(
                    CollectedBytes(file=current_file, data=b"".join(pending))
                )

        except Exception:
            # Best-effort contract: callers still receive an empty result,
            # but the failure is recorded instead of vanishing silently.
            logging.getLogger(__name__).exception("Text ingestion failed")
            yield []

    async def extract_and_process_text(
        self, collected_bytes: CollectedBytes
    ) -> List[str]:
        """Decode one file's bytes and run the text through the processors."""
        text = await self.extract_text_from_file(collected_bytes)
        return await self.process_data(text=text)

    async def extract_text_from_file(
        self, collected_bytes: CollectedBytes
    ) -> str:
        """Return ``collected_bytes.data`` decoded as UTF-8.

        Raises:
            UnicodeDecodeError: If the data is not valid UTF-8.
        """
        return collected_bytes.data.decode("utf-8")

    async def process_data(self, text: str) -> List[str]:
        """Feed ``text`` through every processor, chaining their outputs.

        NOTE(review): with an empty ``processors`` list the input string is
        returned unchanged rather than a list; confirm what callers expect.
        """
        processed_data = text
        for processor in self.processors:
            processed_data = await processor.process(processed_data)
        return processed_data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters