Skip to content

Commit

Permalink
add custom parser for .doc and .docx
Browse files Browse the repository at this point in the history
  • Loading branch information
nicola-corbellini committed Aug 22, 2023
1 parent 7b65023 commit 98cd01f
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 4 deletions.
9 changes: 6 additions & 3 deletions core/cat/rabbit_hole.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,12 @@
import os
import time
import math
import json
import mimetypes
from typing import List, Union
from urllib.request import urlopen, Request
from urllib.parse import urlparse
from urllib.error import HTTPError

from cat.log import log
from starlette.datastructures import UploadFile
from langchain.docstore.document import Document
from qdrant_client.http import models
Expand All @@ -19,6 +17,9 @@
from langchain.document_loaders.blob_loaders.schema import Blob
from langchain.document_loaders.parsers.html.bs4 import BS4HTMLParser

from cat.log import log
from cat.utils import DocxParser


class RabbitHole:
"""Manages content ingestion. I'm late... I'm late!
Expand All @@ -31,7 +32,9 @@ def __init__(self, cat):
"application/pdf": PDFMinerParser(),
"text/plain": TextParser(),
"text/markdown": TextParser(),
"text/html": BS4HTMLParser()
"text/html": BS4HTMLParser(),
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": DocxParser(),
"application/msword": DocxParser()
}

def ingest_memory(self, file: UploadFile):
Expand Down
38 changes: 37 additions & 1 deletion core/cat/utils.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
"""Various utilities used across the project."""

from abc import ABC
from typing import Iterator
from datetime import timedelta

from unstructured.partition.auto import partition
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseBlobParser
from langchain.document_loaders.blob_loaders.schema import Blob

def to_camel_case(text :str ) -> str:

def to_camel_case(text: str) -> str:
"""Format string to camel case.
Takes a string of words separated by either hyphens or underscores and returns a string of words in camel case.
Expand Down Expand Up @@ -67,3 +74,32 @@ def verbal_timedelta(td: timedelta) -> str:
return "{} ago".format(abs_delta)
else:
return "{} ago".format(abs_delta)


class DocxParser(BaseBlobParser, ABC):
    """Blob parser that extracts the text of Microsoft Word files (`.docx` and `.doc`)."""

    def lazy_parse(self, blob: Blob) -> Iterator[Document]:
        """Lazily turn a Word document blob into a single Langchain document.

        Overrides `BaseBlobParser.lazy_parse`.

        Parameters
        ----------
        blob: Blob
            Raw file data to parse.

        Returns
        -------
        Iterator[Document]
            Yields one `Document` holding the text of every partitioned element,
            joined with newlines, and with empty metadata.
        """
        # Unstructured reads from a binary file-like object, so expose the blob as one
        with blob.as_bytes_io() as stream:
            parts = partition(file=stream)

        # One line of text per element, merged into a single page
        page_content = "\n".join(part.text for part in parts)

        yield Document(page_content=page_content, metadata={})

0 comments on commit 98cd01f

Please sign in to comment.