Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""Add word count to document record

Revision ID: 4a5b6c7d8e9f
Revises: 3fbb82ea683d
Create Date: 2026-01-11 15:42:00.000000

"""

from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa

from app.linguistic.word_count import count_words


# pylint: disable=E1101

# revision identifiers, used by Alembic.
revision: str = "4a5b6c7d8e9f"
down_revision: Union[str, None] = "3fbb82ea683d"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    """Add ``document_record.word_count`` and backfill it for existing rows.

    The column is created ``NOT NULL`` with a server default of ``0`` so that
    existing rows are valid as soon as the column exists. The real counts are
    then computed in Python from each row's ``source`` text and written back
    in one batched UPDATE.
    """
    op.add_column(
        "document_record",
        sa.Column(
            "word_count",
            sa.Integer(),
            server_default="0",
            nullable=False,
        ),
    )

    connection = op.get_bind()
    rows = connection.execute(sa.text("SELECT id, source FROM document_record"))

    # Drain the SELECT completely before writing anything: issuing UPDATEs on
    # the same connection while its result is still being iterated is not
    # safe on every driver. Only (id, count) pairs are kept in memory.
    updates = []
    for record_id, source in rows:
        # ``source or ""`` keeps a NULL source from aborting the backfill.
        word_count = count_words(source or "")
        # Rows without words already hold the correct server default of 0.
        if word_count:
            updates.append({"word_count": word_count, "record_id": record_id})

    # Skip the statement entirely when nothing needs writing instead of
    # passing an empty parameter list.
    if updates:
        # A list of parameter dicts runs the statement once per dict
        # (executemany) rather than one round-trip per row from Python.
        connection.execute(
            sa.text(
                "UPDATE document_record SET word_count = :word_count WHERE id = :record_id"
            ),
            updates,
        )


def downgrade() -> None:
    """Revert this revision by dropping the ``word_count`` column."""
    table, column = "document_record", "word_count"
    op.drop_column(table, column)
1 change: 1 addition & 0 deletions backend/app/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ class DocumentRecord(Base):
target: Mapped[str] = mapped_column()
approved: Mapped[bool] = mapped_column(default=False)
target_source: Mapped[RecordSource] = mapped_column(nullable=True)
word_count: Mapped[int] = mapped_column(default=0)

document: Mapped["Document"] = relationship(back_populates="records")
comments: Mapped[list["Comment"]] = relationship(
Expand Down
16 changes: 16 additions & 0 deletions backend/app/documents/query.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,22 @@ def get_document_records_count_with_approved(
result = self.__db.execute(stmt).one()
return result[0], result[1]

def get_document_word_count_with_approved(self, doc: Document) -> tuple[int, int]:
    """
    Sum the stored per-record word counts of a document.

    Returns a tuple of (approved_word_count, total_word_count); both are 0
    when the aggregate yields no value (e.g. the document has no records).
    """
    # Only approved records contribute their word_count to the first sum.
    approved_words = func.sum(
        case(
            (DocumentRecord.approved.is_(True), DocumentRecord.word_count),
            else_=0,
        )
    )
    all_words = func.sum(DocumentRecord.word_count)

    query = select(approved_words, all_words).where(
        DocumentRecord.document_id == doc.id
    )
    approved, total = self.__db.execute(query).one()
    # SUM over zero rows comes back as NULL/None, so fall back to 0.
    return approved or 0, total or 0

def get_document_records_count_filtered(
self, doc: Document, filters: DocumentRecordFilter | None = None
) -> int:
Expand Down
2 changes: 2 additions & 0 deletions backend/app/documents/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ class Document(Identified):
# Document schema extended with per-document record and word counters.
class DocumentWithRecordsCount(Document):
    # Filled by the document routes from the first element of
    # get_document_records_count_with_approved (presumably approved records).
    approved_records_count: int
    # Filled from the second element of the same query (presumably all records).
    records_count: int
    # Sum of word_count over the document's approved records.
    approved_word_count: int
    # Sum of word_count over all of the document's records.
    total_word_count: int


class DocumentRecord(Identified):
Expand Down
12 changes: 12 additions & 0 deletions backend/app/linguistic/word_count.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import re


def count_words(text: str) -> int:
"""
Count words in a text string.
Simple approach: split by whitespace and count non-empty tokens.
This is intentionally not super-precise as per requirements.
"""
# Remove extra whitespace and split by any whitespace
words = re.findall(r"\S+", text)
return len(words)
6 changes: 6 additions & 0 deletions backend/app/routers/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ def get_docs(
output = []
for doc in docs:
records = query.get_document_records_count_with_approved(doc)
words = query.get_document_word_count_with_approved(doc)
output.append(
doc_schema.DocumentWithRecordsCount(
id=doc.id,
Expand All @@ -70,6 +71,8 @@ def get_docs(
type=doc.type.value,
approved_records_count=records[0],
records_count=records[1],
approved_word_count=words[0],
total_word_count=words[1],
)
)
return output
Expand All @@ -82,6 +85,7 @@ def get_doc(
doc = get_doc_by_id(db, doc_id)
query = GenericDocsQuery(db)
records = query.get_document_records_count_with_approved(doc)
words = query.get_document_word_count_with_approved(doc)
return doc_schema.DocumentWithRecordsCount(
id=doc.id,
name=doc.name,
Expand All @@ -90,6 +94,8 @@ def get_doc(
type=doc.type.value,
approved_records_count=records[0],
records_count=records[1],
approved_word_count=words[0],
total_word_count=words[1],
)


Expand Down
14 changes: 13 additions & 1 deletion backend/tests/routers/test_routes_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,11 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
type=DocumentType.txt,
processing_status="pending",
records=[
DocumentRecord(source="Regional Effects", target="Translation")
DocumentRecord(
source="Regional Effects",
target="Translation",
word_count=2,
)
],
created_by=1,
),
Expand All @@ -63,6 +67,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
"type": "txt",
"approved_records_count": 0,
"records_count": 1,
"approved_word_count": 0,
"total_word_count": 2,
},
{
"id": 2,
Expand All @@ -72,6 +78,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
"type": "xliff",
"approved_records_count": 0,
"records_count": 0,
"approved_word_count": 0,
"total_word_count": 0,
},
]

Expand All @@ -82,10 +90,12 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
DocumentRecord(
source="Regional Effects",
target="Translation",
word_count=2,
),
DocumentRecord(
source="User Interface",
target="UI",
word_count=2,
),
]
s.add(
Expand All @@ -109,6 +119,8 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
"approved_records_count": 0,
"records_count": 2,
"type": "txt",
"approved_word_count": 0,
"total_word_count": 4,
}


Expand Down
67 changes: 67 additions & 0 deletions backend/tests/test_word_count.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
from app.linguistic.word_count import count_words


def test_simple_sentence():
    """Plain space-separated sentences yield one count per word."""
    expected_counts = {"Hello world": 2, "This is a test": 4}
    for sentence, expected in expected_counts.items():
        assert count_words(sentence) == expected


def test_empty_string():
    """Empty and whitespace-only strings contain no words."""
    for blank in ("", " ", "\t\n"):
        assert count_words(blank) == 0


def test_single_word():
    """A lone token is counted as exactly one word."""
    for token in ("Hello", "word"):
        assert count_words(token) == 1


def test_multiple_spaces():
    """Test that runs of several spaces between words act as one separator."""
    # The inputs must really contain consecutive spaces; with single spaces
    # this test would only duplicate test_simple_sentence.
    assert count_words("Hello   world") == 2
    assert count_words("This  is   a    test") == 4


def test_leading_trailing_whitespace():
    """Whitespace around the text does not produce extra words."""
    padded_with_spaces = " Hello world "
    padded_with_tabs = "\tTest\t"
    assert count_words(padded_with_spaces) == 2
    assert count_words(padded_with_tabs) == 1


def test_punctuation():
    """Punctuation attached to a word does not change the count."""
    cases = [("Hello, world!", 2), ("This is a test.", 4)]
    for sentence, expected in cases:
        assert count_words(sentence) == expected


def test_numbers():
    """Numeric tokens are counted like any other word."""
    cases = [("123", 1), ("There are 123 items", 4)]
    for sentence, expected in cases:
        assert count_words(sentence) == expected


def test_mixed_content():
    """Words, punctuation, symbols and digits mixed in one sentence."""
    sample = "Hello, world! This is test #123."
    assert count_words(sample) == 6


def test_unicode():
    """Non-ASCII text is split on whitespace like ASCII text."""
    for sentence in ("Hello 世界", "Привет мир"):
        assert count_words(sentence) == 2


def test_newlines_and_tabs():
    """Newlines and tabs separate words just like spaces do."""
    for sentence in ("Hello\nworld", "Hello\tworld", "Hello\n\tworld"):
        assert count_words(sentence) == 2


def test_consecutive_whitespace():
    """A mixed run of spaces, newline and tab acts as one separator."""
    sample = "Hello \n\t world"
    assert count_words(sample) == 2
11 changes: 11 additions & 0 deletions backend/tests/test_worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Translation"
assert record.target_source == RecordSource.translation_memory
assert not record.approved
assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
Expand All @@ -136,6 +137,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == ""
assert record.target_source is None
assert not record.approved
assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
Expand All @@ -148,6 +150,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Региональные эффекты"
assert record.target_source is None
assert record.approved
assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
Expand All @@ -160,6 +163,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "123456789"
assert record.target_source == RecordSource.full_match
assert record.approved
assert record.word_count == 1
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
Expand All @@ -172,6 +176,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Глоссарный перевод"
assert record.target_source == RecordSource.glossary
assert record.approved
assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
Expand Down Expand Up @@ -223,6 +228,7 @@ def test_process_task_sets_txt_records(session: Session):
== "Soon after the characters enter Camp Greenbriar, read or paraphrase the following text:"
)
assert not record.target
assert record.word_count == 13
assert s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 0

record = doc.records[1]
Expand All @@ -231,6 +237,7 @@ def test_process_task_sets_txt_records(session: Session):
== "“Hello, travelers!” calls an energetic giant sloth wearing a bracelet of claws and feathers."
)
assert not record.target
assert record.word_count == 14
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 91
if crlf
Expand All @@ -243,6 +250,7 @@ def test_process_task_sets_txt_records(session: Session):
== "The creature dangles from a nearby tree and waves a three-clawed paw."
)
assert not record.target
assert record.word_count == 12
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 184
if crlf
Expand All @@ -252,6 +260,7 @@ def test_process_task_sets_txt_records(session: Session):
record = doc.records[3]
assert record.source == "“Fresh faces are always welcome in Camp Greenbriar!”"
assert not record.target
assert record.word_count == 8
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 254
if crlf
Expand All @@ -262,6 +271,7 @@ def test_process_task_sets_txt_records(session: Session):
assert record.source == "The sloth is named Razak."
assert record.target == "Translation"
assert record.target_source == RecordSource.translation_memory
assert record.word_count == 5
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 310
if crlf
Expand All @@ -274,6 +284,7 @@ def test_process_task_sets_txt_records(session: Session):
== "He uses black bear stat block, with the following adjustments:"
)
assert not record.target
assert record.word_count == 10
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 336
if crlf
Expand Down
2 changes: 2 additions & 0 deletions backend/worker.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from app.formats.xliff import XliffSegment, extract_xliff_content
from app.glossary.models import GlossaryRecord
from app.glossary.query import GlossaryQuery
from app.linguistic.word_count import count_words
from app.models import DocumentStatus, MachineTranslationSettings, TaskStatus
from app.schema import DocumentTask
from app.translation_memory.models import TranslationMemoryRecord
Expand Down Expand Up @@ -305,6 +306,7 @@ def create_doc_segments(
target=segment.original_segment.translation or "",
approved=segment.approved,
target_source=segment.segment_source,
word_count=count_words(segment.original_segment.original),
)
for segment in segments
]
Expand Down
5 changes: 4 additions & 1 deletion frontend/mocks/documentMocks.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {DocumentStatus} from '../src/client/schemas/DocumentStatus'
import {DocumentRecordUpdate} from '../src/client/schemas/DocumentRecordUpdate'
import {CommentResponse} from '../src/client/schemas/CommentResponse'
import {DocumentRecord} from '../src/client/schemas/DocumentRecord'
import {DocumentWithRecordsCount} from '../src/client/schemas/DocumentWithRecordsCount'

const segmentComments: CommentResponse[] = [
{
Expand Down Expand Up @@ -228,12 +229,14 @@ const recordsData = {
records: segments,
}

const docs = [
const docs: DocumentWithRecordsCount[] = [
{
id: 1,
created_by: 12,
records_count: segments.length,
approved_records_count: segments.filter(({approved}) => approved).length,
total_word_count: 20,
approved_word_count: 4,
name: 'Some document',
status: 'done' as DocumentStatus,
type: 'XLIFF',
Expand Down
2 changes: 2 additions & 0 deletions frontend/src/client/schemas/DocumentWithRecordsCount.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,6 @@ export interface DocumentWithRecordsCount {
type: string
approved_records_count: number
records_count: number
approved_word_count: number
total_word_count: number
}
Loading