Binary file added Applications/.DS_Store
Binary file not shown.
7 changes: 7 additions & 0 deletions Applications/docx-tracked-changes/docx-remote-testing.py
@@ -0,0 +1,7 @@
from tensorlake.runtime import run_remote_application
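# Assumes `analyze_contract_revisions` from docx-tracked-changes.py is
# importable here, and that `contracts` is a list of DOCX URLs such as the
# sample document used in docx-tracked-changes.py.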

# Deploy and run at scale
changes = run_remote_application(
    analyze_contract_revisions,
    contract_urls=contracts
)
243 changes: 243 additions & 0 deletions Applications/docx-tracked-changes/docx-tracked-changes.py
@@ -0,0 +1,243 @@
import os
from typing import List
from tensorlake.applications import Image, application, function, cls
from tensorlake.documentai import DocumentAI

# Define the runtime environment
image = (
Image(base_image="python:3.11-slim", name="docx-example")
.run("pip install tensorlake beautifulsoup4")
)
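# beautifulsoup4 is installed alongside tensorlake because the functions
# below import bs4 at run time to pull tracked-change markup out of each
# parsed chunk.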

@application()
@function(
secrets=["TENSORLAKE_API_KEY"],
image=image
)
def analyze_contract_revisions(contract_urls: List[str]) -> List[dict]:
    """
    Main application that processes contracts with tracked changes
    and aggregates the analysis results
    """
print(f"[DEBUG] Starting analysis of {len(contract_urls)} contracts")

# Process each contract in parallel
print("[DEBUG] Processing contracts in parallel...")
results = process_single_contract.map(contract_urls)

# Collect and analyze all results
all_changes = []
for result in results:
all_changes.extend(result)

print(f"Processed {len(contract_urls)} contracts")
print(f"Found {len(all_changes)} tracked changes total")

return all_changes


@function(
secrets=["TENSORLAKE_API_KEY"],
image=image
)
def process_single_contract(url: str) -> List[dict]:
"""
Process a single contract and extract tracked changes with locations
"""
print(f"[DEBUG] Processing contract: {url}")
from bs4 import BeautifulSoup

doc_ai = DocumentAI()

# Parse the DOCX file
print("[DEBUG] Parsing DOCX file...")
try:
result = doc_ai.parse_and_wait(file=url)
    except Exception as e:
        print(f"[ERROR] Parse failed: {e}")
        # Skip this contract; its tracked changes won't appear in the results
        return []
print("[DEBUG] DOCX parsing complete")

changes = []

# Extract tracked changes from each chunk
print(f"[DEBUG] Processing {len(result.chunks)} document chunks")
for chunk_idx, chunk in enumerate(result.chunks, 1):
print(f"[DEBUG] Processing chunk {chunk_idx}/{len(result.chunks)}")
soup = BeautifulSoup(chunk.markdown, 'html.parser')

# Find insertions
insertions = soup.find_all('ins')
print(f"[DEBUG] Found {len(insertions)} insertions in chunk {chunk_idx}")
for ins in insertions:
text = ins.get_text()
print(f"[DEBUG] Processing insertion: '{text[:50]}{'...' if len(text) > 50 else ''}'")
changes.append({
'type': 'insertion',
'text': text,
'context': get_surrounding_context(ins),
'page': chunk.page_number if hasattr(chunk, 'page_number') else None
})

# Find deletions
deletions = soup.find_all('del')
print(f"[DEBUG] Found {len(deletions)} deletions in chunk {chunk_idx}")
for del_tag in deletions:
text = del_tag.get_text()
print(f"[DEBUG] Processing deletion: '{text[:50]}{'...' if len(text) > 50 else ''}'")
changes.append({
'type': 'deletion',
'text': text,
'context': get_surrounding_context(del_tag),
'page': chunk.page_number if hasattr(chunk, 'page_number') else None
})

# Find comments
comments = soup.find_all('span', class_='comment')
print(f"[DEBUG] Found {len(comments)} comments in chunk {chunk_idx}")
for comment in comments:
            note = comment.get('data-note', '')
            print(f"[DEBUG] Processing comment: '{note[:50]}{'...' if len(note) > 50 else ''}'")
changes.append({
'type': 'comment',
'text': note,
'highlighted_text': comment.get_text(),
'page': chunk.page_number if hasattr(chunk, 'page_number') else None
})

# Also extract spatial information for key clauses
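    # A change is matched to a fragment by page number plus substring
    # containment, so it inherits the bbox and reading order of whichever
    # key-clause fragment contains its text.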
for page in result.pages:
for fragment in page.page_fragments:
if fragment.fragment_type == 'text' and is_key_clause(fragment.content):
# Find any tracked changes in this fragment
fragment_changes = [
c for c in changes
if c.get('page') == page.page_number and
c.get('text', '') in fragment.content
]

for change in fragment_changes:
change['bbox'] = fragment.bbox
change['reading_order'] = fragment.reading_order

return changes


@function(image=image)
def get_surrounding_context(tag, chars=100):
"""Extract surrounding text context for a tracked change"""
text = tag.parent.get_text() if tag.parent else ""
tag_text = tag.get_text()

if tag_text in text:
start = max(0, text.index(tag_text) - chars)
end = min(len(text), text.index(tag_text) + len(tag_text) + chars)
return text[start:end]

return text[:chars]


@function(image=image)
def is_key_clause(text: str) -> bool:
"""Identify if text contains a key contract clause"""
key_terms = [
'indemnification', 'liability', 'warranty', 'termination',
'confidentiality', 'intellectual property', 'force majeure',
'governing law', 'dispute resolution', 'payment terms'
]

text_lower = text.lower()
return any(term in text_lower for term in key_terms)


@application()
@function(
secrets=["TENSORLAKE_API_KEY"],
image=image
)
def query_contract_changes(contract_tuple: tuple[str, str]) -> str:
"""
Query a contract about specific changes using an LLM with full context
"""
print(f"[DEBUG] Starting contract query with parameters: {contract_tuple}")
doc_ai = DocumentAI()

# Parse the contract
print("[DEBUG] Parsing contract for querying...")
result = doc_ai.parse_and_wait(file=contract_tuple[0])
print("[DEBUG] Contract parsing complete")

# Build context from tracked changes and spatial data
context = build_contract_context(result)

# Use an LLM to answer the question (pseudo-code)
# In production, you'd call your LLM API here
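    # A minimal sketch of such a call, assuming the OpenAI Python client
    # (the client, model name, and prompt format are illustrative, not part
    # of this repo):
    #
    #   from openai import OpenAI
    #   client = OpenAI()
    #   completion = client.chat.completions.create(
    #       model="gpt-4o-mini",
    #       messages=[{
    #           "role": "user",
    #           "content": f"{contract_tuple[1]}\n\nTracked changes:\n{context}",
    #       }],
    #   )
    #   answer = completion.choices[0].message.content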
answer = f"""
Based on the tracked changes in the document:

    Question: {contract_tuple[1]}

Analysis:
{context}
"""

return answer


@function(image=image)
def build_contract_context(result) -> str:
"""Build a comprehensive context string with tracked changes and locations"""
from bs4 import BeautifulSoup

context_parts = []

for chunk in result.chunks:
soup = BeautifulSoup(chunk.markdown, 'html.parser')

# Extract insertions with page numbers
for ins in soup.find_all('ins'):
context_parts.append(
f"INSERTION: Added '{ins.get_text()}' "
f"(Page {chunk.page_number if hasattr(chunk, 'page_number') else 'unknown'})"
)

# Extract deletions with page numbers
for del_tag in soup.find_all('del'):
context_parts.append(
f"DELETION: Removed '{del_tag.get_text()}' "
f"(Page {chunk.page_number if hasattr(chunk, 'page_number') else 'unknown'})"
)

# Extract comments
for comment in soup.find_all('span', class_='comment'):
context_parts.append(
f"COMMENT: '{comment.get('data-note')}' on text '{comment.get_text()}' "
f"(Page {chunk.page_number if hasattr(chunk, 'page_number') else 'unknown'})"
)

return "\n".join(context_parts)

if __name__ == "__main__":
from tensorlake.applications import run_local_application

# Analyze multiple contracts
contracts_urls = [
"https://pub-226479de18b2493f96b64c6674705dd8.r2.dev/Commercial%20Property%20Insurance%20Claim%20Assessment%20Report.docx"
]

changes = run_local_application(
analyze_contract_revisions,
contracts_urls
)

print(f"Found {changes} tracked changes across all contracts")

# Query a specific contract
answer = run_local_application(
query_contract_changes,
("https://pub-226479de18b2493f96b64c6674705dd8.r2.dev/Commercial%20Property%20Insurance%20Claim%20Assessment%20Report.docx",
"What changes did opposing counsel make to the liability section?")
)

print(answer)
4 changes: 4 additions & 0 deletions Applications/docx-tracked-changes/requirements.txt
@@ -0,0 +1,4 @@
tensorlake>=0.1.0
beautifulsoup4