diff --git a/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py b/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py
new file mode 100644
index 0000000..749d4d5
--- /dev/null
+++ b/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py
@@ -0,0 +1,73 @@
+"""Add word count to document record
+
+Revision ID: 4a5b6c7d8e9f
+Revises: 3fbb82ea683d
+Create Date: 2026-01-11 15:42:00.000000
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# pylint: disable=E1101
+
+# revision identifiers, used by Alembic.
+revision: str = "4a5b6c7d8e9f"
+down_revision: Union[str, None] = "3fbb82ea683d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _count_words(text: Union[str, None]) -> int:
+    """Count whitespace-separated tokens; NULL or empty text counts as 0.
+
+    This is a frozen copy of the counting rule rather than an import of
+    application code, so the migration keeps producing the same result even
+    if the application helper is later changed, moved or removed.
+    """
+    return len(text.split()) if text else 0
+
+
+def upgrade() -> None:
+    """Add ``document_record.word_count`` and backfill it from ``source``."""
+    # The server default lets the NOT NULL column be added to a table that
+    # already contains rows.
+    op.add_column(
+        "document_record",
+        sa.Column(
+            "word_count",
+            sa.Integer(),
+            server_default="0",
+            nullable=False,
+        ),
+    )
+
+    connection = op.get_bind()
+    # Fetch every row up front so that no SELECT cursor is still open on this
+    # connection while the UPDATEs below are running.
+    rows = connection.execute(
+        sa.text("SELECT id, source FROM document_record")
+    ).fetchall()
+    counted = ((record_id, _count_words(source)) for record_id, source in rows)
+    # Rows counting 0 words already hold the right value via the server default.
+    updates = [
+        {"word_count": words, "record_id": record_id}
+        for record_id, words in counted
+        if words
+    ]
+    if updates:
+        # A list of parameter sets is executed as one executemany() batch.
+        connection.execute(
+            sa.text(
+                "UPDATE document_record SET word_count = :word_count WHERE id = :record_id"
+            ),
+            updates,
+        )
+
+
+def downgrade() -> None:
+    """Drop the ``word_count`` column added by :func:`upgrade`."""
+    op.drop_column("document_record", "word_count")
diff --git a/backend/app/documents/models.py b/backend/app/documents/models.py
index 4450ebd..d3924fe 100644
--- a/backend/app/documents/models.py
+++ b/backend/app/documents/models.py
@@ -113,6 +113,7 @@ class DocumentRecord(Base):
target: Mapped[str] = mapped_column()
approved: Mapped[bool] = mapped_column(default=False)
target_source: Mapped[RecordSource] = mapped_column(nullable=True)
+ word_count: Mapped[int] = mapped_column(default=0)
document: Mapped["Document"] = relationship(back_populates="records")
comments: Mapped[list["Comment"]] = relationship(
diff --git a/backend/app/documents/query.py b/backend/app/documents/query.py
index de0b370..7e01035 100644
--- a/backend/app/documents/query.py
+++ b/backend/app/documents/query.py
@@ -88,6 +88,22 @@ def get_document_records_count_with_approved(
result = self.__db.execute(stmt).one()
return result[0], result[1]
+ def get_document_word_count_with_approved(self, doc: Document) -> tuple[int, int]:
+     """
+     Sum record word counts for ``doc``: ``(approved_word_count, total_word_count)``.
+     A missing (NULL) sum is reported as 0.
+     """
+     approved_words = case(
+         (DocumentRecord.approved.is_(True), DocumentRecord.word_count),
+         else_=0,
+     )
+     totals_stmt = select(
+         func.sum(approved_words), func.sum(DocumentRecord.word_count)
+     ).where(DocumentRecord.document_id == doc.id)
+     approved_sum, total_sum = self.__db.execute(totals_stmt).one()
+     # SUM over zero rows yields NULL/None, hence the fallback to 0.
+     return approved_sum or 0, total_sum or 0
+
def get_document_records_count_filtered(
self, doc: Document, filters: DocumentRecordFilter | None = None
) -> int:
diff --git a/backend/app/documents/schema.py b/backend/app/documents/schema.py
index 73eb8b9..4a68544 100644
--- a/backend/app/documents/schema.py
+++ b/backend/app/documents/schema.py
@@ -23,6 +23,8 @@ class Document(Identified):
class DocumentWithRecordsCount(Document):
approved_records_count: int
records_count: int
+ approved_word_count: int
+ total_word_count: int
class DocumentRecord(Identified):
diff --git a/backend/app/linguistic/word_count.py b/backend/app/linguistic/word_count.py
new file mode 100644
index 0000000..d0e22ca
--- /dev/null
+++ b/backend/app/linguistic/word_count.py
@@ -0,0 +1,12 @@
+import re
+
+
+def count_words(text: str | None) -> int:
+    """Return the number of whitespace-delimited tokens in ``text``.
+
+    Deliberately approximate: punctuation stays attached to its token and an
+    unspaced run (e.g. CJK text) is one token. ``None`` or empty text gives 0.
+    """
+    if not text:
+        return 0
+    return len(re.findall(r"\S+", text))
diff --git a/backend/app/routers/document.py b/backend/app/routers/document.py
index b639ee0..e9e35e9 100644
--- a/backend/app/routers/document.py
+++ b/backend/app/routers/document.py
@@ -61,6 +61,7 @@ def get_docs(
output = []
for doc in docs:
records = query.get_document_records_count_with_approved(doc)
+ words = query.get_document_word_count_with_approved(doc)
output.append(
doc_schema.DocumentWithRecordsCount(
id=doc.id,
@@ -70,6 +71,8 @@ def get_docs(
type=doc.type.value,
approved_records_count=records[0],
records_count=records[1],
+ approved_word_count=words[0],
+ total_word_count=words[1],
)
)
return output
@@ -82,6 +85,7 @@ def get_doc(
doc = get_doc_by_id(db, doc_id)
query = GenericDocsQuery(db)
records = query.get_document_records_count_with_approved(doc)
+ words = query.get_document_word_count_with_approved(doc)
return doc_schema.DocumentWithRecordsCount(
id=doc.id,
name=doc.name,
@@ -90,6 +94,8 @@ def get_doc(
type=doc.type.value,
approved_records_count=records[0],
records_count=records[1],
+ approved_word_count=words[0],
+ total_word_count=words[1],
)
diff --git a/backend/tests/routers/test_routes_documents.py b/backend/tests/routers/test_routes_documents.py
index e93112b..fd4a5db 100644
--- a/backend/tests/routers/test_routes_documents.py
+++ b/backend/tests/routers/test_routes_documents.py
@@ -38,7 +38,11 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
type=DocumentType.txt,
processing_status="pending",
records=[
- DocumentRecord(source="Regional Effects", target="Translation")
+ DocumentRecord(
+ source="Regional Effects",
+ target="Translation",
+ word_count=2,
+ )
],
created_by=1,
),
@@ -63,6 +67,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
"type": "txt",
"approved_records_count": 0,
"records_count": 1,
+ "approved_word_count": 0,
+ "total_word_count": 2,
},
{
"id": 2,
@@ -72,6 +78,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
"type": "xliff",
"approved_records_count": 0,
"records_count": 0,
+ "approved_word_count": 0,
+ "total_word_count": 0,
},
]
@@ -82,10 +90,12 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
DocumentRecord(
source="Regional Effects",
target="Translation",
+ word_count=2,
),
DocumentRecord(
source="User Interface",
target="UI",
+ word_count=2,
),
]
s.add(
@@ -109,6 +119,8 @@ def test_can_get_document(user_logged_client: TestClient, session: Session):
"approved_records_count": 0,
"records_count": 2,
"type": "txt",
+ "approved_word_count": 0,
+ "total_word_count": 4,
}
diff --git a/backend/tests/test_word_count.py b/backend/tests/test_word_count.py
new file mode 100644
index 0000000..d70b3c2
--- /dev/null
+++ b/backend/tests/test_word_count.py
@@ -0,0 +1,67 @@
+from app.linguistic.word_count import count_words
+
+
+def test_simple_sentence():
+    """Test counting words in a simple sentence."""
+    assert count_words("Hello world") == 2
+    assert count_words("This is a test") == 4
+
+
+def test_empty_string():
+    """Test that empty and whitespace-only strings return 0."""
+    assert count_words("") == 0
+    assert count_words(" ") == 0
+    assert count_words("\t\n") == 0
+
+
+def test_single_word():
+    """Test counting a single word."""
+    assert count_words("Hello") == 1
+    assert count_words("word") == 1
+
+
+def test_multiple_spaces():
+    """Test that a run of several spaces between words is one separator."""
+    assert count_words("Hello    world") == 2
+    assert count_words("This  is   a    test") == 4
+
+
+def test_leading_trailing_whitespace():
+    """Test that leading and trailing whitespace are handled."""
+    assert count_words(" Hello world ") == 2
+    assert count_words("\tTest\t") == 1
+
+
+def test_punctuation():
+    """Test that punctuation attached to a word does not add to the count."""
+    assert count_words("Hello, world!") == 2
+    assert count_words("This is a test.") == 4
+
+
+def test_numbers():
+    """Test that numbers are counted as words."""
+    assert count_words("123") == 1
+    assert count_words("There are 123 items") == 4
+
+
+def test_mixed_content():
+    """Test mixed content with various characters."""
+    assert count_words("Hello, world! This is test #123.") == 6
+
+
+def test_unicode():
+    """Test that non-ASCII text splits on whitespace only (a CJK run is one token)."""
+    assert count_words("Hello 世界") == 2
+    assert count_words("Привет мир") == 2
+
+
+def test_newlines_and_tabs():
+    """Test that newlines and tabs are treated as whitespace."""
+    assert count_words("Hello\nworld") == 2
+    assert count_words("Hello\tworld") == 2
+    assert count_words("Hello\n\tworld") == 2
+
+
+def test_consecutive_whitespace():
+    """Test that consecutive mixed whitespace is handled."""
+    assert count_words("Hello \n\t world") == 2
diff --git a/backend/tests/test_worker.py b/backend/tests/test_worker.py
index f42ff53..cf6fb06 100644
--- a/backend/tests/test_worker.py
+++ b/backend/tests/test_worker.py
@@ -124,6 +124,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Translation"
assert record.target_source == RecordSource.translation_memory
assert not record.approved
+ assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
@@ -136,6 +137,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == ""
assert record.target_source is None
assert not record.approved
+ assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
@@ -148,6 +150,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Региональные эффекты"
assert record.target_source is None
assert record.approved
+ assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
@@ -160,6 +163,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "123456789"
assert record.target_source == RecordSource.full_match
assert record.approved
+ assert record.word_count == 1
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
@@ -172,6 +176,7 @@ def test_process_task_sets_xliff_records(session: Session):
assert record.target == "Глоссарный перевод"
assert record.target_source == RecordSource.glossary
assert record.approved
+ assert record.word_count == 2
xliff_record = (
s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one()
)
@@ -223,6 +228,7 @@ def test_process_task_sets_txt_records(session: Session):
== "Soon after the characters enter Camp Greenbriar, read or paraphrase the following text:"
)
assert not record.target
+ assert record.word_count == 13
assert s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 0
record = doc.records[1]
@@ -231,6 +237,7 @@ def test_process_task_sets_txt_records(session: Session):
== "“Hello, travelers!” calls an energetic giant sloth wearing a bracelet of claws and feathers."
)
assert not record.target
+ assert record.word_count == 14
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 91
if crlf
@@ -243,6 +250,7 @@ def test_process_task_sets_txt_records(session: Session):
== "The creature dangles from a nearby tree and waves a three-clawed paw."
)
assert not record.target
+ assert record.word_count == 12
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 184
if crlf
@@ -252,6 +260,7 @@ def test_process_task_sets_txt_records(session: Session):
record = doc.records[3]
assert record.source == "“Fresh faces are always welcome in Camp Greenbriar!”"
assert not record.target
+ assert record.word_count == 8
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 254
if crlf
@@ -262,6 +271,7 @@ def test_process_task_sets_txt_records(session: Session):
assert record.source == "The sloth is named Razak."
assert record.target == "Translation"
assert record.target_source == RecordSource.translation_memory
+ assert record.word_count == 5
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 310
if crlf
@@ -274,6 +284,7 @@ def test_process_task_sets_txt_records(session: Session):
== "He uses black bear stat block, with the following adjustments:"
)
assert not record.target
+ assert record.word_count == 10
assert (
s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 336
if crlf
diff --git a/backend/worker.py b/backend/worker.py
index dd0f9ef..0086c86 100644
--- a/backend/worker.py
+++ b/backend/worker.py
@@ -24,6 +24,7 @@
from app.formats.xliff import XliffSegment, extract_xliff_content
from app.glossary.models import GlossaryRecord
from app.glossary.query import GlossaryQuery
+from app.linguistic.word_count import count_words
from app.models import DocumentStatus, MachineTranslationSettings, TaskStatus
from app.schema import DocumentTask
from app.translation_memory.models import TranslationMemoryRecord
@@ -305,6 +306,7 @@ def create_doc_segments(
target=segment.original_segment.translation or "",
approved=segment.approved,
target_source=segment.segment_source,
+ word_count=count_words(segment.original_segment.original),
)
for segment in segments
]
diff --git a/frontend/mocks/documentMocks.ts b/frontend/mocks/documentMocks.ts
index b0ba6b5..c213c54 100644
--- a/frontend/mocks/documentMocks.ts
+++ b/frontend/mocks/documentMocks.ts
@@ -17,6 +17,7 @@ import {DocumentStatus} from '../src/client/schemas/DocumentStatus'
import {DocumentRecordUpdate} from '../src/client/schemas/DocumentRecordUpdate'
import {CommentResponse} from '../src/client/schemas/CommentResponse'
import {DocumentRecord} from '../src/client/schemas/DocumentRecord'
+import {DocumentWithRecordsCount} from '../src/client/schemas/DocumentWithRecordsCount'
const segmentComments: CommentResponse[] = [
{
@@ -228,12 +229,14 @@ const recordsData = {
records: segments,
}
-const docs = [
+const docs: DocumentWithRecordsCount[] = [
{
id: 1,
created_by: 12,
records_count: segments.length,
approved_records_count: segments.filter(({approved}) => approved).length,
+ total_word_count: 20,
+ approved_word_count: 4,
name: 'Some document',
status: 'done' as DocumentStatus,
type: 'XLIFF',
diff --git a/frontend/src/client/schemas/DocumentWithRecordsCount.ts b/frontend/src/client/schemas/DocumentWithRecordsCount.ts
index d9ac2c6..2e9424a 100644
--- a/frontend/src/client/schemas/DocumentWithRecordsCount.ts
+++ b/frontend/src/client/schemas/DocumentWithRecordsCount.ts
@@ -10,4 +10,6 @@ export interface DocumentWithRecordsCount {
type: string
approved_records_count: number
records_count: number
+ approved_word_count: number
+ total_word_count: number
}
diff --git a/frontend/src/components/DocumentListItem.vue b/frontend/src/components/DocumentListItem.vue
index 6aeb6ae..e14c98b 100644
--- a/frontend/src/components/DocumentListItem.vue
+++ b/frontend/src/components/DocumentListItem.vue
@@ -27,7 +27,12 @@ const classes = computed(() => {
})
const progressBarTitle = computed(() => {
- return `Segments: ${props.document.approved_records_count}/${props.document.records_count}`
+ return `Words: ${props.document.approved_word_count}/${props.document.total_word_count}`
+})
+
+const progressValue = computed(() => { // percentage of words approved; 0 when there are no words
+  const {approved_word_count: approved, total_word_count: total} = props.document
+  return total === 0 ? 0 : (approved / total) * 100
+})
const busy = ref(false)
@@ -74,7 +79,7 @@ const deleteFile = async () => {
diff --git a/frontend/src/views/DocView.vue b/frontend/src/views/DocView.vue
index f5960d8..fb8e211 100644
--- a/frontend/src/views/DocView.vue
+++ b/frontend/src/views/DocView.vue
@@ -72,8 +72,8 @@ const documentDownloadLink = computed(() => {
const translationProgress = computed(() => {
const doc = document.value
- if (!doc) return 0
- return (doc.approved_records_count / doc.records_count) * 100
+ if (!doc || doc.total_word_count === 0) return 0
+ return (doc.approved_word_count / doc.total_word_count) * 100
})
const updatePage = async (event: PageState) => {
@@ -229,8 +229,8 @@ const showAddTermModal = ref(false)
:value="translationProgress"
:show-value="false"
/>
- {{ document?.approved_records_count }} /
- {{ document?.records_count }}
+ {{ document?.approved_word_count }} /
+ {{ document?.total_word_count }} words