diff --git a/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py b/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py
new file mode 100644
index 0000000..749d4d5
--- /dev/null
+++ b/backend/alembic/versions/4a5b6c7d8e9f_add_word_count_to_document_record.py
@@ -0,0 +1,71 @@
+"""Add word count to document record
+
+Revision ID: 4a5b6c7d8e9f
+Revises: 3fbb82ea683d
+Create Date: 2026-01-11 15:42:00.000000
+
+"""
+
+from typing import Sequence, Union
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# pylint: disable=E1101
+
+# revision identifiers, used by Alembic.
+revision: str = "4a5b6c7d8e9f"
+down_revision: Union[str, None] = "3fbb82ea683d"
+branch_labels: Union[str, Sequence[str], None] = None
+depends_on: Union[str, Sequence[str], None] = None
+
+
+def _count_words(text: Union[str, None]) -> int:
+    """Return the number of whitespace-separated tokens in *text*.
+
+    Same whitespace-token rule the application's ``count_words`` applied
+    when this revision was written. It is inlined instead of imported from
+    ``app`` so that later changes to application code cannot alter or break
+    this migration. ``None`` or empty text counts as 0.
+    """
+    return len(text.split()) if text else 0
+
+
+def upgrade() -> None:
+    """Add ``document_record.word_count`` and backfill it for existing rows."""
+    # The server default lets the NOT NULL column be added to a populated table.
+    op.add_column(
+        "document_record",
+        sa.Column(
+            "word_count",
+            sa.Integer(),
+            server_default="0",
+            nullable=False,
+        ),
+    )
+
+    connection = op.get_bind()
+    rows = connection.execute(sa.text("SELECT id, source FROM document_record"))
+    # Collect the updates first so that no write is issued while the SELECT
+    # result is still being read. Rows with zero words are skipped because
+    # the server default has already set them to 0.
+    updates = []
+    for record_id, source in rows:
+        words = _count_words(source)
+        if words:
+            updates.append({"word_count": words, "record_id": record_id})
+
+    if updates:
+        # A list of parameter sets executes the UPDATE in executemany mode.
+        connection.execute(
+            sa.text(
+                "UPDATE document_record SET word_count = :word_count WHERE id = :record_id"
+            ),
+            updates,
+        )
+
+
+def downgrade() -> None:
+    """Drop the ``word_count`` column from ``document_record``."""
+    op.drop_column("document_record", "word_count")
diff --git a/backend/app/documents/models.py b/backend/app/documents/models.py
index 4450ebd..d3924fe 100644
--- a/backend/app/documents/models.py
+++ b/backend/app/documents/models.py
@@ -113,6 +113,7 @@ class DocumentRecord(Base):
     target: Mapped[str] = mapped_column()
     approved: Mapped[bool] = mapped_column(default=False)
     target_source: Mapped[RecordSource] = mapped_column(nullable=True)
+    word_count: Mapped[int] = 
mapped_column(default=0)
 
     document: Mapped["Document"] = relationship(back_populates="records")
     comments: Mapped[list["Comment"]] = relationship(
diff --git a/backend/app/documents/query.py b/backend/app/documents/query.py
index de0b370..7e01035 100644
--- a/backend/app/documents/query.py
+++ b/backend/app/documents/query.py
@@ -88,6 +88,26 @@ def get_document_records_count_with_approved(
         result = self.__db.execute(stmt).one()
         return result[0], result[1]
 
+    def get_document_word_count_with_approved(self, doc: Document) -> tuple[int, int]:
+        """
+        Returns tuple of (approved_word_count, total_word_count) for a document.
+
+        Both values are 0 for a document without records: SUM() over an empty
+        set yields NULL, which is normalised here. The sums are cast to int
+        because some database drivers may return SUM() as Decimal.
+        """
+        stmt = select(
+            func.sum(
+                case(
+                    (DocumentRecord.approved.is_(True), DocumentRecord.word_count),
+                    else_=0,
+                )
+            ),
+            func.sum(DocumentRecord.word_count),
+        ).where(DocumentRecord.document_id == doc.id)
+        approved_words, total_words = self.__db.execute(stmt).one()
+        return int(approved_words or 0), int(total_words or 0)
+
     def get_document_records_count_filtered(
         self, doc: Document, filters: DocumentRecordFilter | None = None
     ) -> int:
diff --git a/backend/app/documents/schema.py b/backend/app/documents/schema.py
index 73eb8b9..4a68544 100644
--- a/backend/app/documents/schema.py
+++ b/backend/app/documents/schema.py
@@ -23,6 +23,8 @@ class Document(Identified):
 class DocumentWithRecordsCount(Document):
     approved_records_count: int
     records_count: int
+    approved_word_count: int
+    total_word_count: int
 
 
 class DocumentRecord(Identified):
diff --git a/backend/app/linguistic/word_count.py b/backend/app/linguistic/word_count.py
new file mode 100644
index 0000000..d0e22ca
--- /dev/null
+++ b/backend/app/linguistic/word_count.py
@@ -0,0 +1,15 @@
+def count_words(text: str) -> int:
+    """
+    Count words in a text string.
+
+    A word is any run of non-whitespace characters, so punctuation attached
+    to a word does not add to the count. This is intentionally not
+    super-precise as per requirements.
+
+    Empty, whitespace-only or missing (None) text counts as 0 words.
+    """
+    if not text:
+        return 0
+    # split() without arguments splits on runs of whitespace and drops
+    # empty tokens, so leading/trailing/repeated whitespace is ignored.
+    return len(text.split())
diff --git a/backend/app/routers/document.py b/backend/app/routers/document.py
index b639ee0..e9e35e9 100644
--- a/backend/app/routers/document.py
+++ b/backend/app/routers/document.py
@@ -61,6 +61,7 @@ def get_docs(
     output = []
     for doc in docs:
         records = query.get_document_records_count_with_approved(doc)
+        words = query.get_document_word_count_with_approved(doc)
         output.append(
             doc_schema.DocumentWithRecordsCount(
                 id=doc.id,
@@ -70,6 +71,8 @@ def get_docs(
                 type=doc.type.value,
                 approved_records_count=records[0],
                 records_count=records[1],
+                approved_word_count=words[0],
+                total_word_count=words[1],
             )
         )
     return output
@@ -82,6 +85,7 @@ def get_doc(
     doc = get_doc_by_id(db, doc_id)
     query = GenericDocsQuery(db)
     records = query.get_document_records_count_with_approved(doc)
+    words = query.get_document_word_count_with_approved(doc)
     return doc_schema.DocumentWithRecordsCount(
         id=doc.id,
         name=doc.name,
@@ -90,6 +94,8 @@ def get_doc(
         type=doc.type.value,
         approved_records_count=records[0],
         records_count=records[1],
+        approved_word_count=words[0],
+        total_word_count=words[1],
     )
 
 
diff --git a/backend/tests/routers/test_routes_documents.py b/backend/tests/routers/test_routes_documents.py
index e93112b..fd4a5db 100644
--- a/backend/tests/routers/test_routes_documents.py
+++ b/backend/tests/routers/test_routes_documents.py
@@ -38,7 +38,11 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
                 type=DocumentType.txt,
                 processing_status="pending",
                 records=[
-                    DocumentRecord(source="Regional Effects", target="Translation")
+                    DocumentRecord(
+                        source="Regional Effects",
+                        target="Translation",
+                        word_count=2,
+                    )
                 ],
                 created_by=1,
             ),
@@ -63,6 +67,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session):
             "type": "txt",
             "approved_records_count": 0,
             "records_count": 1,
+            "approved_word_count": 0,
+            "total_word_count": 2,
}, { "id": 2, @@ -72,6 +78,8 @@ def test_can_get_list_of_docs(user_logged_client: TestClient, session: Session): "type": "xliff", "approved_records_count": 0, "records_count": 0, + "approved_word_count": 0, + "total_word_count": 0, }, ] @@ -82,10 +90,12 @@ def test_can_get_document(user_logged_client: TestClient, session: Session): DocumentRecord( source="Regional Effects", target="Translation", + word_count=2, ), DocumentRecord( source="User Interface", target="UI", + word_count=2, ), ] s.add( @@ -109,6 +119,8 @@ def test_can_get_document(user_logged_client: TestClient, session: Session): "approved_records_count": 0, "records_count": 2, "type": "txt", + "approved_word_count": 0, + "total_word_count": 4, } diff --git a/backend/tests/test_word_count.py b/backend/tests/test_word_count.py new file mode 100644 index 0000000..d70b3c2 --- /dev/null +++ b/backend/tests/test_word_count.py @@ -0,0 +1,67 @@ +from app.linguistic.word_count import count_words + + +def test_simple_sentence(): + """Test counting words in a simple sentence.""" + assert count_words("Hello world") == 2 + assert count_words("This is a test") == 4 + + +def test_empty_string(): + """Test that empty string returns 0.""" + assert count_words("") == 0 + assert count_words(" ") == 0 + assert count_words("\t\n") == 0 + + +def test_single_word(): + """Test counting a single word.""" + assert count_words("Hello") == 1 + assert count_words("word") == 1 + + +def test_multiple_spaces(): + """Test that multiple spaces are handled correctly.""" + assert count_words("Hello world") == 2 + assert count_words("This is a test") == 4 + + +def test_leading_trailing_whitespace(): + """Test that leading and trailing whitespace are handled.""" + assert count_words(" Hello world ") == 2 + assert count_words("\tTest\t") == 1 + + +def test_punctuation(): + """Test that punctuation doesn't affect word count.""" + assert count_words("Hello, world!") == 2 + assert count_words("This is a test.") == 4 + + +def test_numbers(): + 
"""Test that numbers are counted as words.""" + assert count_words("123") == 1 + assert count_words("There are 123 items") == 4 + + +def test_mixed_content(): + """Test mixed content with various characters.""" + assert count_words("Hello, world! This is test #123.") == 6 + + +def test_unicode(): + """Test that unicode characters are handled.""" + assert count_words("Hello 世界") == 2 + assert count_words("Привет мир") == 2 + + +def test_newlines_and_tabs(): + """Test that newlines and tabs are treated as whitespace.""" + assert count_words("Hello\nworld") == 2 + assert count_words("Hello\tworld") == 2 + assert count_words("Hello\n\tworld") == 2 + + +def test_consecutive_whitespace(): + """Test that consecutive whitespace is handled.""" + assert count_words("Hello \n\t world") == 2 diff --git a/backend/tests/test_worker.py b/backend/tests/test_worker.py index f42ff53..cf6fb06 100644 --- a/backend/tests/test_worker.py +++ b/backend/tests/test_worker.py @@ -124,6 +124,7 @@ def test_process_task_sets_xliff_records(session: Session): assert record.target == "Translation" assert record.target_source == RecordSource.translation_memory assert not record.approved + assert record.word_count == 2 xliff_record = ( s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one() ) @@ -136,6 +137,7 @@ def test_process_task_sets_xliff_records(session: Session): assert record.target == "" assert record.target_source is None assert not record.approved + assert record.word_count == 2 xliff_record = ( s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one() ) @@ -148,6 +150,7 @@ def test_process_task_sets_xliff_records(session: Session): assert record.target == "Региональные эффекты" assert record.target_source is None assert record.approved + assert record.word_count == 2 xliff_record = ( s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one() ) @@ -160,6 +163,7 @@ def test_process_task_sets_xliff_records(session: Session): assert record.target == 
"123456789" assert record.target_source == RecordSource.full_match assert record.approved + assert record.word_count == 1 xliff_record = ( s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one() ) @@ -172,6 +176,7 @@ def test_process_task_sets_xliff_records(session: Session): assert record.target == "Глоссарный перевод" assert record.target_source == RecordSource.glossary assert record.approved + assert record.word_count == 2 xliff_record = ( s.query(XliffRecord).filter(XliffRecord.parent_id == record.id).one() ) @@ -223,6 +228,7 @@ def test_process_task_sets_txt_records(session: Session): == "Soon after the characters enter Camp Greenbriar, read or paraphrase the following text:" ) assert not record.target + assert record.word_count == 13 assert s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 0 record = doc.records[1] @@ -231,6 +237,7 @@ def test_process_task_sets_txt_records(session: Session): == "“Hello, travelers!” calls an energetic giant sloth wearing a bracelet of claws and feathers." ) assert not record.target + assert record.word_count == 14 assert ( s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 91 if crlf @@ -243,6 +250,7 @@ def test_process_task_sets_txt_records(session: Session): == "The creature dangles from a nearby tree and waves a three-clawed paw." ) assert not record.target + assert record.word_count == 12 assert ( s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 184 if crlf @@ -252,6 +260,7 @@ def test_process_task_sets_txt_records(session: Session): record = doc.records[3] assert record.source == "“Fresh faces are always welcome in Camp Greenbriar!”" assert not record.target + assert record.word_count == 8 assert ( s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 254 if crlf @@ -262,6 +271,7 @@ def test_process_task_sets_txt_records(session: Session): assert record.source == "The sloth is named Razak." 
assert record.target == "Translation" assert record.target_source == RecordSource.translation_memory + assert record.word_count == 5 assert ( s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 310 if crlf @@ -274,6 +284,7 @@ def test_process_task_sets_txt_records(session: Session): == "He uses black bear stat block, with the following adjustments:" ) assert not record.target + assert record.word_count == 10 assert ( s.query(TxtRecord).filter_by(parent_id=record.id).one().offset == 336 if crlf diff --git a/backend/worker.py b/backend/worker.py index dd0f9ef..0086c86 100644 --- a/backend/worker.py +++ b/backend/worker.py @@ -24,6 +24,7 @@ from app.formats.xliff import XliffSegment, extract_xliff_content from app.glossary.models import GlossaryRecord from app.glossary.query import GlossaryQuery +from app.linguistic.word_count import count_words from app.models import DocumentStatus, MachineTranslationSettings, TaskStatus from app.schema import DocumentTask from app.translation_memory.models import TranslationMemoryRecord @@ -305,6 +306,7 @@ def create_doc_segments( target=segment.original_segment.translation or "", approved=segment.approved, target_source=segment.segment_source, + word_count=count_words(segment.original_segment.original), ) for segment in segments ] diff --git a/frontend/mocks/documentMocks.ts b/frontend/mocks/documentMocks.ts index b0ba6b5..c213c54 100644 --- a/frontend/mocks/documentMocks.ts +++ b/frontend/mocks/documentMocks.ts @@ -17,6 +17,7 @@ import {DocumentStatus} from '../src/client/schemas/DocumentStatus' import {DocumentRecordUpdate} from '../src/client/schemas/DocumentRecordUpdate' import {CommentResponse} from '../src/client/schemas/CommentResponse' import {DocumentRecord} from '../src/client/schemas/DocumentRecord' +import {DocumentWithRecordsCount} from '../src/client/schemas/DocumentWithRecordsCount' const segmentComments: CommentResponse[] = [ { @@ -228,12 +229,14 @@ const recordsData = { records: segments, } -const 
docs = [ +const docs: DocumentWithRecordsCount[] = [ { id: 1, created_by: 12, records_count: segments.length, approved_records_count: segments.filter(({approved}) => approved).length, + total_word_count: 20, + approved_word_count: 4, name: 'Some document', status: 'done' as DocumentStatus, type: 'XLIFF', diff --git a/frontend/src/client/schemas/DocumentWithRecordsCount.ts b/frontend/src/client/schemas/DocumentWithRecordsCount.ts index d9ac2c6..2e9424a 100644 --- a/frontend/src/client/schemas/DocumentWithRecordsCount.ts +++ b/frontend/src/client/schemas/DocumentWithRecordsCount.ts @@ -10,4 +10,6 @@ export interface DocumentWithRecordsCount { type: string approved_records_count: number records_count: number + approved_word_count: number + total_word_count: number } diff --git a/frontend/src/components/DocumentListItem.vue b/frontend/src/components/DocumentListItem.vue index 6aeb6ae..e14c98b 100644 --- a/frontend/src/components/DocumentListItem.vue +++ b/frontend/src/components/DocumentListItem.vue @@ -27,7 +27,12 @@ const classes = computed(() => { }) const progressBarTitle = computed(() => { - return `Segments: ${props.document.approved_records_count}/${props.document.records_count}` + return `Words: ${props.document.approved_word_count}/${props.document.total_word_count}` +}) + +const progressValue = computed(() => { + if (props.document.total_word_count === 0) return 0 + return (props.document.approved_word_count / props.document.total_word_count) * 100 }) const busy = ref(false) @@ -74,7 +79,7 @@ const deleteFile = async () => { diff --git a/frontend/src/views/DocView.vue b/frontend/src/views/DocView.vue index f5960d8..fb8e211 100644 --- a/frontend/src/views/DocView.vue +++ b/frontend/src/views/DocView.vue @@ -72,8 +72,8 @@ const documentDownloadLink = computed(() => { const translationProgress = computed(() => { const doc = document.value - if (!doc) return 0 - return (doc.approved_records_count / doc.records_count) * 100 + if (!doc || doc.total_word_count === 
0) return 0 + return (doc.approved_word_count / doc.total_word_count) * 100 }) const updatePage = async (event: PageState) => { @@ -229,8 +229,8 @@ const showAddTermModal = ref(false) :value="translationProgress" :show-value="false" /> - {{ document?.approved_records_count }} / - {{ document?.records_count }} + {{ document?.approved_word_count }} / + {{ document?.total_word_count }} words