# text_processing.py

import re
import pymupdf
from nltk.tokenize import sent_tokenize
from docx.api import Document
from pptx import Presentation
from bs4 import BeautifulSoup
import pypandoc

# Typographic punctuation mapped to its ASCII equivalent; the soft hyphen
# (U+00AD) is deleted outright.
_ASCII_PUNCTUATION_TABLE = str.maketrans(
    "\u201c\u201d\u2018\u2019\u2012\u2013\u2014\u2015",
    "\"\"''----",
    "\xad",
)


def clean_text(text):
    """Normalize *text* to whitespace-collapsed ASCII.

    Curly quotes become straight quotes, the figure/en/em dashes and the
    horizontal bar become ``-``, soft hyphens are removed, every remaining
    non-ASCII character becomes a space, and runs of whitespace (including
    newlines) collapse to a single space with the ends stripped.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Map typographic punctuation BEFORE stripping non-ASCII characters;
    # otherwise the strip turns them into spaces and the mapping never fires.
    text = text.translate(_ASCII_PUNCTUATION_TABLE)
    text = re.sub(r"[^\x00-\x7F]", " ", text)
    # Collapse whitespace last so spaces introduced above are merged too.
    return re.sub(r'\s+', ' ', text).strip()

# Function to create text chunks
def segment_text(text, max_segment_length=700, batch_size=7):
    """Split *text* into sentence-aligned segments and group them in batches.

    Sentences (as produced by ``sent_tokenize``) are appended to the current
    segment while the segment length plus the sentence length stays within
    ``max_segment_length``; otherwise the current segment is closed and a new
    one is started with that sentence. A single sentence longer than the
    limit is kept whole as its own segment.

    Args:
        text: The text to segment.
        max_segment_length: Character budget used when deciding whether a
            sentence still fits in the current segment.
        batch_size: Number of segments per batch.

    Returns:
        A list of batches, each a list of at most ``batch_size`` segment
        strings. Empty when no sentences are found.
    """
    segments = []
    current_segment = ""

    for sentence in sent_tokenize(text):
        if len(current_segment) + len(sentence) <= max_segment_length:
            current_segment += sentence + " "
            continue
        # Only flush a segment that has content: when the very first
        # sentence is over the limit there is nothing to emit yet, and an
        # empty string must not be recorded as a segment.
        if current_segment:
            segments.append(current_segment.strip())
        current_segment = sentence + " "

    if current_segment:
        segments.append(current_segment.strip())

    # Create batches
    return [segments[i:i + batch_size] for i in range(0, len(segments), batch_size)]

def get_pdf_text(pdf_file):
    """Extract the text of every page of a PDF.

    Args:
        pdf_file: An object whose ``read()`` returns the PDF content; the
            result is handed to ``pymupdf.open`` as an in-memory stream.

    Returns:
        The text of all pages concatenated in page order.
    """
    # The context manager closes the document even if extraction fails;
    # previously the document was never closed.
    with pymupdf.open(stream=pdf_file.read(), filetype="pdf") as doc:
        return "".join(page.get_text() for page in doc)

# Function to get text from a DOCX file
def get_doc_text(doc_files):
    """Return the paragraph text of a document opened with ``Document``.

    Args:
        doc_files: Whatever ``Document`` accepts as its source argument.

    Returns:
        Every paragraph's text, each followed by a newline, concatenated in
        document order.
    """
    document = Document(doc_files)
    paragraph_texts = [para.text for para in document.paragraphs]
    return "".join(line + "\n" for line in paragraph_texts)

# Function to get text from a PPTX file
def get_ppt_text(ppt_files):
    """Return the text of all shapes in a presentation opened with ``Presentation``.

    Args:
        ppt_files: Whatever ``Presentation`` accepts as its source argument.

    Returns:
        The text of every shape that exposes a ``text`` attribute, each
        followed by a newline, in slide order then shape order.
    """
    presentation = Presentation(ppt_files)
    fragments = []
    for slide in presentation.slides:
        # Shapes without a ``text`` attribute are skipped.
        fragments.extend(
            shape.text + "\n"
            for shape in slide.shapes
            if hasattr(shape, "text")
        )
    return "".join(fragments)

# Function to get text from HTML files
def _html_source_text(source):
    """Return the text of one HTML source: an open file-like object or a path."""
    if hasattr(source, "read"):
        # Already-open file object: hand it to the parser directly.
        return BeautifulSoup(source, 'html.parser').get_text()
    with open(source, 'r', encoding='utf-8') as f:
        return BeautifulSoup(f, 'html.parser').get_text()


def get_html_text(html_files):
    """Extract the text content of one or more HTML sources.

    Args:
        html_files: Either an iterable of sources, or a single source. A
            source is a filesystem path (opened as UTF-8) or an object with
            a ``read()`` method.

    Returns:
        The ``get_text()`` output of every source, concatenated in order.
    """
    # A file object is itself iterable (line by line) and a str is iterable
    # (character by character), so single sources must be detected
    # explicitly rather than looped over as if they were a list of paths.
    is_single_source = (
        hasattr(html_files, "read")
        or isinstance(html_files, str)
        or hasattr(html_files, "__fspath__")
    )
    sources = [html_files] if is_single_source else html_files
    return "".join(_html_source_text(source) for source in sources)

# Function to get text from LaTeX files
def _latex_source_text(source):
    """Return the plain-text rendering of one LaTeX source (file-like or path)."""
    if hasattr(source, "read"):
        # An open file object has no path for pandoc to read, so convert
        # its content instead.
        data = source.read()
        if isinstance(data, bytes):
            data = data.decode("utf-8")
        return pypandoc.convert_text(data, 'plain', format='latex')
    return pypandoc.convert_file(source, 'plain')


def get_latex_text(latex_files):
    """Convert one or more LaTeX sources to plain text with pandoc.

    Args:
        latex_files: Either an iterable of sources, or a single source. A
            source is a filesystem path or an object with a ``read()``
            method (bytes content is decoded as UTF-8).

    Returns:
        The plain-text conversion of every source, concatenated in order.
    """
    # A file object is itself iterable (line by line) and a str is iterable
    # (character by character), so single sources must be detected
    # explicitly rather than looped over as if they were a list of paths.
    is_single_source = (
        hasattr(latex_files, "read")
        or isinstance(latex_files, str)
        or hasattr(latex_files, "__fspath__")
    )
    sources = [latex_files] if is_single_source else latex_files
    return "".join(_latex_source_text(source) for source in sources)

# Function to parse text from a file
def parse_text(file):
    """Return the content of *file* decoded as UTF-8.

    Args:
        file: An object whose ``getvalue()`` returns the raw bytes.

    Returns:
        The decoded string.
    """
    return file.getvalue().decode("utf-8")

# Function to get text from uploaded documents
def get_text_from_document(file):
    """Extract text from an uploaded document based on its file extension.

    The extension of ``file.name`` is matched case-insensitively, so
    ``report.PDF`` is handled the same as ``report.pdf``.

    Args:
        file: The uploaded file object, or ``None``. It must expose a
            ``name`` attribute; the selected reader receives the object
            itself.

    Returns:
        The extracted text, or ``""`` when *file* is ``None`` or its
        extension is not recognized.
    """
    if file is None:
        return ""

    # Lowercase once so upper- or mixed-case extensions are recognized.
    name = file.name.lower()

    if name.endswith('.pdf'):
        return get_pdf_text(file)
    # NOTE(review): '.doc' and '.ppt' are routed to the same readers as
    # '.docx' and '.pptx'; confirm those readers can parse the legacy formats.
    if name.endswith(('.docx', '.doc')):
        return get_doc_text(file)
    if name.endswith(('.pptx', '.ppt')):
        return get_ppt_text(file)
    if name.endswith(('.html', '.htm')):
        return get_html_text(file)
    if name.endswith('.tex'):
        return get_latex_text(file)
    if name.endswith('.txt'):
        return parse_text(file)
    return ""