feat: add convert_to_offline_html() to api.py

joanise · joanise · commit 975f3c48fb4a · 2024-12-13T11:45:06.000-05:00
diff --git a/readalongs/api.py b/readalongs/api.py
@@ -45,15 +45,18 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
 import io
 import logging
 import os
+import tempfile
 from dataclasses import dataclass
 from typing import Optional, Sequence, Tuple, Union
 
 import click
+from lxml import etree
 
 from readalongs import cli
 from readalongs.align import create_ras_from_text
 from readalongs.log import LOGGER
 from readalongs.text.add_ids_to_xml import add_ids
+from readalongs.text.make_package import create_web_component_html
 from readalongs.text.util import parse_xml
 from readalongs.util import JoinerCallbackForClick, get_langs_deferred
 
@@ -222,12 +225,11 @@ def convert_to_readalong(
             Page breaks are marked by two empty sentences in a row
         language: list of languages to declare at the top of the readalong
             (has no functional effect since g2p is not applied, it's only metadata)
+        offline_html: if True, return the full offline HTML instead of just the .readalong XML
 
     Returns:
-        str: the readalong XML string, ready to print to a .readalong file
+        str: the readalong XML or HTML file contents, ready to print to .readalong or .html
     """
-    from lxml import etree
-
     xml_text = create_ras_from_text(
         ["".join(token.text for token in sentence) for sentence in sentences],
         language,
@@ -259,3 +261,37 @@ def convert_to_readalong(
     ).decode("utf8")
 
     return xml_text + "\n"
+
+
+def convert_to_offline_html(
+    sentences: Sequence[Sequence[Token]],
+    audio_file_name: Union[str, os.PathLike],
+    language: Sequence[str] = ("und",),
+) -> str:
+    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
+    into a readalong Offline HTML
+
+    Args:
+        sentences: a list of sentences, each of which is a list of Token objects
+            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
+            Page breaks are marked by two empty sentences in a row
+        audio_file_name: the name of the audio file to be used in the offline HTML
+        language: list of languages to declare at the top of the readalong
+            (has no functional effect since g2p is not applied, it's only metadata)
+
+    Returns:
+        str: the offline HTML file contents, ready to print to a .html file
+    """
+
+    readalong_xml = convert_to_readalong(sentences, language)
+    readalong_file = tempfile.NamedTemporaryFile(
+        "w", encoding="utf8", delete=False, suffix=".readalong"
+    )
+    try:
+        # Close the file before passing its name on: reopening a still-open
+        # NamedTemporaryFile by name is not portable (it fails on Windows).
+        with readalong_file:
+            readalong_file.write(readalong_xml)
+        return create_web_component_html(readalong_file.name, audio_file_name)
+    finally:
+        os.unlink(readalong_file.name)
diff --git a/test/test_api.py b/test/test_api.py
@@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
             api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
         self.assertIn("deprecated", "\n".join(cm.output))
 
+    sentences_to_convert = [  # shared input for the convert_to_* tests below
+        [
+            api.Token("Bonjöûr,", 0.2, 1.0),
+            api.Token(" "),
+            api.Token("hello", 1.0, 0.2),
+            api.Token("!"),
+        ],
+        [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
+        [],  # an empty sentence marks a paragraph break
+        [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
+        [],  # two empty sentences in a row mark a page break
+        [],
+        [
+            api.Token("("),
+            api.Token('"'),
+            api.Token("Page2", 5.2, 0.2),
+            api.Token("."),
+            api.Token('"'),
+            api.Token(")"),
+        ],
+    ]
+
     def test_convert_to_readalong(self):
-        sentences = [
-            [
-                api.Token("Bonjöûr,", 0.2, 1.0),
-                api.Token(" "),
-                api.Token("hello", 1.0, 0.2),
-                api.Token("!"),
-            ],
-            [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
-            [],
-            [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
-            [],
-            [],
-            [
-                api.Token("("),
-                api.Token('"'),
-                api.Token("Page2", 5.2, 0.2),
-                api.Token("."),
-                api.Token('"'),
-                api.Token(")"),
-            ],
-        ]
-
-        readalong = api.convert_to_readalong(sentences)
+
+        readalong = api.convert_to_readalong(self.sentences_to_convert)
         # print(readalong)
 
         # Make the reference by calling align with the same text and adjusting
         # things we expect to be different.
         sentences_as_text = "\n".join(
-            "".join(token.text for token in sentence) for sentence in sentences
+            "".join(token.text for token in sentence)
+            for sentence in self.sentences_to_convert
         )
         with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
             f.write(sentences_as_text)
@@ -152,6 +154,21 @@ def test_convert_to_readalong(self):
         readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
         self.assertEqual(readalong, align_result)
 
+    def test_convert_to_offline_html(self):
+        html = api.convert_to_offline_html(
+            self.sentences_to_convert, str(self.data_dir / "noise.mp3")
+        )
+        # Keep a copy for inspection in the test tempdir, not the working directory
+        with open(self.tempdir / "test.html", "w", encoding="utf8") as f:
+            f.write(html)
+        self.assertIn("<html", html)
+        self.assertIn("<body", html)
+        self.assertIn('<meta name="generator" content="@readalongs/studio (cli)', html)
+        self.assertIn('<read-along href="data:application/readalong+xml;base64', html)
+        self.assertIn('audio="data:audio/', html)
+        self.assertIn("<span slot='read-along-header'>", html)
+        self.assertIn("<span slot='read-along-subheader'>", html)
+
 
 if __name__ == "__main__":
     main()