feat: add api.convert_to_readalong

ReadAlongs · Dec 6, 2024 · 02017c1 · 02017c1
1 parent 954211c
commit 02017c1
Show file tree

Hide file tree

Showing 3 changed files with 142 additions and 3 deletions.
diff --git a/readalongs/align.py b/readalongs/align.py
@@ -1191,7 +1191,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
 """
 
 
-def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
+def create_ras_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
     """Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
         Uses the line sequence to infer paragraph and sentence structure from plain text:
         Assumes a double blank line marks a page break, and a single blank line

diff --git a/readalongs/api.py b/readalongs/api.py
@@ -31,16 +31,26 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
                         you come accross such an exception and you believe the
                         problem is not in your own code.
  - log: any logging messages issued during execution
+
+Additional API function:
+
+convert_to_readalong()
+
 """
 
 import io
 import logging
-from typing import Optional, Tuple
+from dataclasses import dataclass
+from pathlib import Path
+from typing import List, Optional, Tuple
 
 import click
 
 from readalongs import cli
+from readalongs.align import create_ras_from_text
 from readalongs.log import LOGGER
+from readalongs.text.add_ids_to_xml import add_ids
+from readalongs.text.util import parse_xml
 from readalongs.util import JoinerCallbackForClick, get_langs_deferred
 
 
@@ -116,11 +126,13 @@ def make_xml(
     Returns: (status, exception, log_text)
     """
     # plaintextfile is not a file object if passed from click
+
     plaintextfile = (
         plaintextfile.name
         if isinstance(plaintextfile, click.utils.LazyFile)
         else plaintextfile
     )
+    xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
     logging_stream = io.StringIO()
     logging_handler = logging.StreamHandler(logging_stream)
     try:
@@ -157,3 +169,75 @@ def prepare(*args, **kwargs):
         "readalongs.api.prepare() is deprecated. Please use make_xml() instead."
     )
     return make_xml(*args, **kwargs)
+
+
+@dataclass
+class Token:
+    """A token in a readalong: a word has a time and dur, a non-word does not."""
+
+    text: str
+    time: Optional[float]
+    dur: Optional[float]
+    is_word: bool
+
+    def __init__(
+        self,
+        text: str,
+        time: Optional[float] = None,
+        dur: Optional[float] = None,
+        is_word: Optional[bool] = None,
+    ):
+        """Create a word token:
+            t = Token("asdf", time=1.3, dur=.34) or t = Token("asdf", 1.3, .34)
+        Create a non-word token (e.g., punctuation, spacing):
+            t = Token(", ")
+        """
+        self.text = text
+        self.time = time
+        self.dur = dur
+        self.is_word = is_word if is_word is not None else bool(time is not None)
+
+
+def convert_to_readalong(sentences: List[List[Token]]) -> str:
+    """Convert a list of pages of tokens into a readalong XML string.
+
+    Args:
+        sentences: a list of sentences, each of which is a list of Token objects
+            Paragraph breaks are marked by a empty sentence (i.e., an empty list)
+            Page breaks are marked by two empty sentences in a row
+
+    Returns:
+        str: the readalong XML string, ready to print to a .readalong file
+    """
+    from lxml import etree
+
+    xml_text = create_ras_from_text(
+        ["".join(token.text for token in sentence) for sentence in sentences], ("und",)
+    )
+    xml = parse_xml(xml_text)
+    filtered_sentences = [sentence for sentence in sentences if sentence]
+    for sentence, sentence_xml in zip(filtered_sentences, xml.findall(".//s")):
+        sentence_xml.text = ""
+        for token in sentence:
+            if token.is_word:
+                w = etree.Element("w")
+                w.text = token.text
+                w.attrib["time"] = str(token.time)
+                w.attrib["dur"] = str(token.dur)
+                sentence_xml.append(w)
+            else:
+                if len(sentence_xml):  # if it has children
+                    if not sentence_xml[-1].tail:
+                        sentence_xml[-1].tail = ""
+                    sentence_xml[-1].tail += token.text
+                else:
+                    sentence_xml.text += token.text
+
+    xml = add_ids(xml)
+    xml_text = etree.tostring(
+        xml,
+        encoding="utf-8",
+        xml_declaration=True,
+    ).decode("utf8")
+
+    return xml_text + "\n"
diff --git a/test/test_api.py b/test/test_api.py
@@ -5,6 +5,7 @@
 """
 
 import os
+import re
 from contextlib import redirect_stderr
 from io import StringIO
 from unittest import main
@@ -13,7 +14,7 @@
 from basic_test_case import BasicTestCase
 from sound_swallower_stub import SoundSwallowerStub
 
-import readalongs.api as api
+from readalongs import api
 from readalongs.log import LOGGER
 
 
@@ -96,6 +97,60 @@ def test_deprecated_prepare(self):
             api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
         self.assertIn("deprecated", "\n".join(cm.output))
 
+    def test_convert_to_readalong(self):
+        sentences = [
+            [
+                api.Token("Bonjöûr,", 0.2, 1.0),
+                api.Token(" "),
+                api.Token("hello", 1.0, 0.2),
+                api.Token("!"),
+            ],
+            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
+            [],
+            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
+            [],
+            [],
+            [
+                api.Token("("),
+                api.Token('"'),
+                api.Token("Page2", 5.2, 0.2),
+                api.Token("."),
+                api.Token('"'),
+                api.Token(")"),
+            ],
+        ]
+
+        readalong = api.convert_to_readalong(sentences)
+        # print(readalong)
+
+        # Make the reference by calling align with the same text and adjusting
+        # things we expect to be different.
+        sentences_as_text = "\n".join(
+            "".join(token.text for token in sentence) for sentence in sentences
+        )
+        with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
+            f.write(sentences_as_text)
+        result = api.align(
+            self.tempdir / "sentences.txt",
+            self.data_dir / "noise.mp3",
+            self.tempdir / "output",
+            ("und",),
+        )
+        if result[0] != 0:
+            print("align error:", result)
+        with open(self.tempdir / "output/www/output.readalong", encoding="utf8") as f:
+            align_result = f.read()
+
+        align_result = re.sub(r" ARPABET=\".*?\"", "", align_result)
+        align_result = re.sub(
+            r'<w (id=".*?") time=".*?" dur=".*?"',
+            r'<w time="ttt" dur="ddd" \1',
+            align_result,
+        )
+        readalong = re.sub(r"time=\".*?\"", 'time="ttt"', readalong)
+        readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
+        self.assertEqual(readalong, align_result.strip())
+
 
 if __name__ == "__main__":
     main()