ArchiDevil · ArchiDevil · Jan 11, 2026 · Jan 11, 2026 · Jan 11, 2026
diff --git a/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py b/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py
@@ -0,0 +1,52 @@
+"""Add word count to document record
+
+Revision ID: 4a5b6c7d8e9f
+Revises: 3fbb82ea683d
+Create Date: 2026-01-11 15:42:00.000000
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+from app.linguistic.word_count import count_words
+
+
+# pylint: disable=E1101
+
+# revision identifiers, used by Alembic.
+# `revision` is this migration's own ID and `down_revision` is the migration
+# it is applied on top of; both mirror the "Revision ID" / "Revises" lines in
+# the module docstring.
+revision: str = "4a5b6c7d8e9f"
+down_revision: Union[str, None] = "3fbb82ea683d"
+# No branch labels or additional dependencies are declared for this revision.
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def upgrade() -> None:
+    """Add ``document_record.word_count`` and backfill it for existing rows.
+
+    The column is created NOT NULL with a server-side default of 0, so rows
+    already in the table satisfy the constraint immediately. The real value
+    is then computed from each record's ``source`` text via ``count_words``.
+    """
+    op.add_column(
+        "document_record",
+        sa.Column(
+            "word_count",
+            sa.Integer(),
+            server_default="0",
+            nullable=False,
+        ),
+    )
+
+    connection = op.get_bind()
+    # Materialise the result set before issuing any UPDATE so we never write
+    # to the table while a cursor over it is still open on this connection.
+    rows = connection.execute(
+        sa.text("SELECT id, source FROM document_record")
+    ).fetchall()
+
+    # Rows with an empty or NULL source already hold the correct value (the
+    # server default 0); skipping them also keeps count_words from being
+    # called with None.
+    params = [
+        {"word_count": count_words(source), "record_id": record_id}
+        for record_id, source in rows
+        if source
+    ]
+    if params:
+        # A list of parameter sets is executed as a single executemany()
+        # rather than one round trip per record.
+        connection.execute(
+            sa.text(
+                "UPDATE document_record SET word_count = :word_count WHERE id = :record_id"
+            ),
+            params,
+        )
+
+
+def downgrade() -> None:
+    """Revert the upgrade by dropping ``document_record.word_count``."""
+    op.drop_column(table_name="document_record", column_name="word_count")
diff --git a/backend/app/documents/models.py b/backend/app/documents/models.py
@@ -113,6 +113,7 @@ class DocumentRecord(Base):
     target: Mapped[str] = mapped_column()
     approved: Mapped[bool] = mapped_column(default=False)
     target_source: Mapped[RecordSource] = mapped_column(nullable=True)
+    word_count: Mapped[int] = mapped_column(default=0)
 
     document: Mapped["Document"] = relationship(back_populates="records")
     comments: Mapped[list["Comment"]] = relationship(

diff --git a/backend/app/documents/query.py b/backend/app/documents/query.py
@@ -88,6 +88,22 @@ def get_document_records_count_with_approved(
         result = self.__db.execute(stmt).one()
         return result[0], result[1]
 
+    def get_document_word_count_with_approved(self, doc: Document) -> tuple[int, int]:
+        """
+        Sum the word counts of a document's records in a single query.
+
+        Returns a ``(approved_word_count, total_word_count)`` tuple; both
+        values are 0 when the sums come back empty.
+        """
+        # Only approved records contribute their word count to the first sum.
+        approved_words = func.sum(
+            case(
+                (DocumentRecord.approved.is_(True), DocumentRecord.word_count),
+                else_=0,
+            )
+        )
+        total_words = func.sum(DocumentRecord.word_count)
+        stmt = select(approved_words, total_words).where(
+            DocumentRecord.document_id == doc.id
+        )
+        approved, total = self.__db.execute(stmt).one()
+        # A sum with nothing to add up comes back as None; report it as 0.
+        return approved or 0, total or 0
+
     def get_document_records_count_filtered(
         self, doc: Document, filters: DocumentRecordFilter | None = None
     ) -> int:

diff --git a/backend/app/documents/schema.py b/backend/app/documents/schema.py
@@ -23,6 +23,8 @@ class Document(Identified):
 class DocumentWithRecordsCount(Document):
+    # Document payload extended with per-document record and word totals.
     approved_records_count: int
     records_count: int
+    # Sum of word_count over the document's approved records.
+    approved_word_count: int
+    # Sum of word_count over all of the document's records.
+    total_word_count: int
 
 
 class DocumentRecord(Identified):

diff --git a/backend/app/linguistic/word_count.py b/backend/app/linguistic/word_count.py
@@ -0,0 +1,12 @@
+import re
+
+
+def count_words(text: str) -> int:
+    """
+    Return the number of whitespace-delimited tokens in ``text``.
+
+    A "word" is any maximal run of non-whitespace characters, so attached
+    punctuation stays part of its word and empty or whitespace-only input
+    yields 0. The count is deliberately approximate.
+    """
+    # Tally the matches lazily; each match is one run of non-whitespace.
+    return sum(1 for _ in re.finditer(r"\S+", text))
diff --git a/backend/app/routers/document.py b/backend/app/routers/document.py
@@ -61,6 +61,7 @@ def get_docs(
     output = []
     for doc in docs:
         records = query.get_document_records_count_with_approved(doc)
+        words = query.get_document_word_count_with_approved(doc)
         output.append(
             doc_schema.DocumentWithRecordsCount(
                 id=doc.id,
@@ -70,6 +71,8 @@ def get_docs(
                 type=doc.type.value,
                 approved_records_count=records[0],
                 records_count=records[1],
+                approved_word_count=words[0],
+                total_word_count=words[1],
             )
         )
     return output
@@ -82,6 +85,7 @@ def get_doc(
     doc = get_doc_by_id(db, doc_id)
     query = GenericDocsQuery(db)
     records = query.get_document_records_count_with_approved(doc)
+    words = query.get_document_word_count_with_approved(doc)
     return doc_schema.DocumentWithRecordsCount(
         id=doc.id,
         name=doc.name,
@@ -90,6 +94,8 @@ def get_doc(
         type=doc.type.value,
         approved_records_count=records[0],
         records_count=records[1],
+        approved_word_count=words[0],
+        total_word_count=words[1],
     )
 
 

diff --git a/backend/tests/routers/test_routes_documents.py b/backend/tests/routers/test_routes_documents.py
@@ -38,7 +38,11 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
                     type=DocumentType.txt,
                     processing_status="pending",
                     records=[
-                        DocumentRecord(source="Regional Effects", target="Translation")
+                        DocumentRecord(
+                            source="Regional Effects",
+                            target="Translation",
+                            word_count=2,
+                        )
                     ],
                     created_by=1,
                 ),
@@ -63,6 +67,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
             "type": "txt",
             "approved_records_count": 0,
             "records_count": 1,
+            "approved_word_count": 0,
+            "total_word_count": 2,
         },
         {
             "id": 2,
@@ -72,6 +78,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
             "type": "xliff",
             "approved_records_count": 0,
             "records_count": 0,
+            "approved_word_count": 0,
+            "total_word_count": 0,
         },
     ]
 
@@ -82,10 +90,12 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
             DocumentRecord(
                 source="Regional Effects",
                 target="Translation",
+                word_count=2,
             ),
             DocumentRecord(
                 source="User Interface",
                 target="UI",
+                word_count=2,
             ),
         ]
         s.add(
@@ -109,6 +119,8 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
         "approved_records_count": 0,
         "records_count": 2,
         "type": "txt",
+        "approved_word_count": 0,
+        "total_word_count": 4,
     }
 
 

diff --git a/backend/tests/test_word_count.py b/backend/tests/test_word_count.py
@@ -0,0 +1,67 @@
+from app.linguistic.word_count import count_words
+
+
+def test_simple_sentence():
+    """Space-separated sentences produce one count per word."""
+    expectations = {"Hello world": 2, "This is a test": 4}
+    for sentence, expected in expectations.items():
+        assert count_words(sentence) == expected
+
+
+def test_empty_string():
+    """Empty and whitespace-only inputs contain no words."""
+    for blank in ("", "   ", "\t\n"):
+        assert count_words(blank) == 0
+
+
+def test_single_word():
+    """A lone token counts as exactly one word."""
+    for token in ("Hello", "word"):
+        assert count_words(token) == 1
+
+
+def test_multiple_spaces():
+    """Runs of several spaces separate words just like a single space."""
+    cases = (("Hello    world", 2), ("This  is  a  test", 4))
+    for text, expected in cases:
+        assert count_words(text) == expected
+
+
+def test_leading_trailing_whitespace():
+    """Whitespace at either end of the text adds no words."""
+    cases = (("  Hello world  ", 2), ("\tTest\t", 1))
+    for text, expected in cases:
+        assert count_words(text) == expected
+
+
+def test_punctuation():
+    """Punctuation attached to a word does not create extra words."""
+    cases = (("Hello, world!", 2), ("This is a test.", 4))
+    for text, expected in cases:
+        assert count_words(text) == expected
+
+
+def test_numbers():
+    """Numeric tokens are counted the same way as words."""
+    cases = (("123", 1), ("There are 123 items", 4))
+    for text, expected in cases:
+        assert count_words(text) == expected
+
+
+def test_mixed_content():
+    """Words, punctuation, symbols and digits mixed in one sentence."""
+    sample = "Hello, world! This is test #123."
+    assert count_words(sample) == 6
+
+
+def test_unicode():
+    """Non-ASCII text is split on whitespace like any other text."""
+    for text in ("Hello 世界", "Привет мир"):
+        assert count_words(text) == 2
+
+
+def test_newlines_and_tabs():
+    """Newlines and tabs separate words exactly as spaces do."""
+    for text in ("Hello\nworld", "Hello\tworld", "Hello\n\tworld"):
+        assert count_words(text) == 2
+
+
+def test_consecutive_whitespace():
+    """A mixed run of spaces, newline and tab acts as one separator."""
+    sample = "Hello   \n\t   world"
+    assert count_words(sample) == 2
diff --git a/backend/tests/test_worker.py b/backend/tests/test_worker.py
@@ -124,6 +124,7 @@ def test_process_task_sets_xliff_records(session: Session):
         assert record.target == "Translation"
         assert record.target_source == RecordSource.translation_memory
         assert not record.approved
+        assert record.word_count == 2
         xliff_record = (
             s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
         )
@@ -136,6 +137,7 @@ def test_process_task_sets_xliff_records(session: Session):
         assert record.target == ""
         assert record.target_source is None
         assert not record.approved
+        assert record.word_count == 2
         xliff_record = (
             s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
         )
@@ -148,6 +150,7 @@ def test_process_task_sets_xliff_records(session: Session):
         assert record.target == "Региональные эффекты"
         assert record.target_source is None
         assert record.approved
+        assert record.word_count == 2
         xliff_record = (
             s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
         )
@@ -160,6 +163,7 @@ def test_process_task_sets_xliff_records(session: Session):
         assert record.target == "123456789"
         assert record.target_source == RecordSource.full_match
         assert record.approved
+        assert record.word_count == 1
         xliff_record = (
             s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
         )
@@ -172,6 +176,7 @@ def test_process_task_sets_xliff_records(session: Session):
         assert record.target == "Глоссарный перевод"
         assert record.target_source == RecordSource.glossary
         assert record.approved
+        assert record.word_count == 2
         xliff_record = (
             s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
         )
@@ -223,6 +228,7 @@ def test_process_task_sets_txt_records(session: Session):
             == "Soon after the characters enter Camp Greenbriar, read or paraphrase the following text:"
         )
         assert not record.target
+        assert record.word_count == 13
         assert s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 0
 
         record = doc.records[1]
@@ -231,6 +237,7 @@ def test_process_task_sets_txt_records(session: Session):
             == "“Hello, travelers!” calls an energetic giant sloth wearing a bracelet of claws and feathers."
         )
         assert not record.target
+        assert record.word_count == 14
         assert (
             s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 91
             if crlf
@@ -243,6 +250,7 @@ def test_process_task_sets_txt_records(session: Session):
             == "The creature dangles from a nearby tree and waves a three-clawed paw."
         )
         assert not record.target
+        assert record.word_count == 12
         assert (
             s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 184
             if crlf
@@ -252,6 +260,7 @@ def test_process_task_sets_txt_records(session: Session):
         record = doc.records[3]
         assert record.source == "“Fresh faces are always welcome in Camp Greenbriar!”"
         assert not record.target
+        assert record.word_count == 8
         assert (
             s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 254
             if crlf
@@ -262,6 +271,7 @@ def test_process_task_sets_txt_records(session: Session):
         assert record.source == "The sloth is named Razak."
         assert record.target == "Translation"
         assert record.target_source == RecordSource.translation_memory
+        assert record.word_count == 5
         assert (
             s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 310
             if crlf
@@ -274,6 +284,7 @@ def test_process_task_sets_txt_records(session: Session):
             == "He uses black bear stat block, with the following adjustments:"
         )
         assert not record.target
+        assert record.word_count == 10
         assert (
             s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 336
             if crlf

diff --git a/backend/worker.py b/backend/worker.py
@@ -24,6 +24,7 @@
 from app.formats.xliff import XliffSegment, extract_xliff_content
 from app.glossary.models import GlossaryRecord
 from app.glossary.query import GlossaryQuery
+from app.linguistic.word_count import count_words
 from app.models import DocumentStatus, MachineTranslationSettings, TaskStatus
 from app.schema import DocumentTask
 from app.translation_memory.models import TranslationMemoryRecord
@@ -305,6 +306,7 @@ def create_doc_segments(
             target=segment.original_segment.translation or "",
             approved=segment.approved,
             target_source=segment.segment_source,
+            word_count=count_words(segment.original_segment.original),
         )
         for segment in segments
     ]

diff --git a/frontend/mocks/documentMocks.ts b/frontend/mocks/documentMocks.ts
@@ -17,6 +17,7 @@ import {DocumentStatus} from '../src/client/schemas/DocumentStatus'
 import {DocumentRecordUpdate} from '../src/client/schemas/DocumentRecordUpdate'
 import {CommentResponse} from '../src/client/schemas/CommentResponse'
 import {DocumentRecord} from '../src/client/schemas/DocumentRecord'
+import {DocumentWithRecordsCount} from '../src/client/schemas/DocumentWithRecordsCount'
 
 const segmentComments: CommentResponse[] = [
   {
@@ -228,12 +229,14 @@ const recordsData = {
   records: segments,
 }
 
-const docs = [
+const docs: DocumentWithRecordsCount[] = [
   {
     id: 1,
     created_by: 12,
     records_count: segments.length,
     approved_records_count: segments.filter(({approved}) => approved).length,
+    total_word_count: 20,
+    approved_word_count: 4,
     name: 'Some document',
     status: 'done' as DocumentStatus,
     type: 'XLIFF',

diff --git a/frontend/src/client/schemas/DocumentWithRecordsCount.ts b/frontend/src/client/schemas/DocumentWithRecordsCount.ts
@@ -10,4 +10,6 @@ export interface DocumentWithRecordsCount {
   type: string
   approved_records_count: number
   records_count: number
+  approved_word_count: number
+  total_word_count: number
 }