Skip to content

Commit

Permalink
Refactor text processing in BestRAG and update version to 0.3.0
Browse files Browse the repository at this point in the history
Signed-off-by: samadpls <abdulsamadsid1@gmail.com>
  • Loading branch information
samadpls committed Nov 30, 2024
1 parent 33d7a2f commit 31884c0
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 1 deletion.
7 changes: 7 additions & 0 deletions bestrag/best_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,13 @@ def _clean_text(self, text: str) -> str:
text = text.replace('\n', ' ')
text = re.sub(r'\s{2,}', ' ', text)
text = re.sub(r'(\d+\.)\s', r'\n\1 ', text)
text = re.sub(r'[●■○]', '', text)
text = re.sub(r'[“”‘’«»]', '"', text)
text = re.sub(r'[–—−]', '-', text)
text = re.sub(r'[^\x00-\x7F]+', '', text)
text = re.sub(r'[\u200B-\u200D\uFEFF]', '', text)
text = re.sub(r'\s+', ' ', text)

return text.strip()

def _get_dense_embedding(self, text: str):
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

setup(
name="bestrag",
version="0.2.1",
version="0.3.0",
description="bestrag: Library for storing and searching document embeddings in a Qdrant vector database using hybrid embedding techniques.",
author="samadpls",
author_email="abdulsamadsid1@gmail.com",
Expand Down

0 comments on commit 31884c0

Please sign in to comment.