Skip to content

Commit 975f3c4

Browse files
committed
feat: add convert_to_offline_html() to api.py
1 parent b3f83fd commit 975f3c4

File tree

2 files changed

+80
-27
lines changed

2 files changed

+80
-27
lines changed

readalongs/api.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -45,15 +45,18 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
4545
import io
4646
import logging
4747
import os
48+
import tempfile
4849
from dataclasses import dataclass
4950
from typing import Optional, Sequence, Tuple, Union
5051

5152
import click
53+
from lxml import etree
5254

5355
from readalongs import cli
5456
from readalongs.align import create_ras_from_text
5557
from readalongs.log import LOGGER
5658
from readalongs.text.add_ids_to_xml import add_ids
59+
from readalongs.text.make_package import create_web_component_html
5760
from readalongs.text.util import parse_xml
5861
from readalongs.util import JoinerCallbackForClick, get_langs_deferred
5962

@@ -222,12 +225,11 @@ def convert_to_readalong(
222225
Page breaks are marked by two empty sentences in a row
223226
language: list of languages to declare at the top of the readalong
224227
(has no functional effect since g2p is not applied, it's only metadata)
228+
offline_html: if True, return the full offline HTML instead of just the .readalong XML
225229
226230
Returns:
227-
str: the readalong XML string, ready to print to a .readalong file
231+
str: the readalong XML or HTML file contents, ready to print to .readalong or .html
228232
"""
229-
from lxml import etree
230-
231233
xml_text = create_ras_from_text(
232234
["".join(token.text for token in sentence) for sentence in sentences],
233235
language,
@@ -259,3 +261,37 @@ def convert_to_readalong(
259261
).decode("utf8")
260262

261263
return xml_text + "\n"
264+
265+
266+
def convert_to_offline_html(
    sentences: Sequence[Sequence[Token]],
    audio_file_name: Union[str, os.PathLike],
    language: Sequence[str] = ("und",),
) -> str:
    """Convert a list of sentences/paragraphs/pages of tokens, with corresponding audio,
    into a readalong Offline HTML

    Args:
        sentences: a list of sentences, each of which is a list of Token objects
            Paragraph breaks are marked by an empty sentence (i.e., an empty list)
            Page breaks are marked by two empty sentences in a row
        audio_file_name: the name of the audio file to be used in the offline HTML
        language: list of languages to declare at the top of the readalong
            (has no functional effect since g2p is not applied, it's only metadata)

    Returns:
        str: the offline HTML file contents, ready to print to a .html file

    Any exception raised while generating the readalong or the HTML propagates
    to the caller; the temporary .readalong file is removed either way.
    """

    readalong_xml = convert_to_readalong(sentences, language)

    # create_web_component_html() is given the readalong by file name, so the
    # XML is written to a named temporary file first. delete=False keeps the
    # file on disk after it is closed; we remove it ourselves below.
    # The file is created *before* the try block so that the finally clause
    # never refers to an unbound name if creation itself fails.
    readalong_file = tempfile.NamedTemporaryFile(
        "w", encoding="utf8", delete=False, suffix=".readalong"
    )
    try:
        # The with-block guarantees the handle is closed (and flushed) before
        # the file is handed over by name, even if write() raises.
        with readalong_file:
            readalong_file.write(readalong_xml)
        return create_web_component_html(readalong_file.name, audio_file_name)
    finally:
        os.unlink(readalong_file.name)

test/test_api.py

Lines changed: 41 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -97,36 +97,38 @@ def test_deprecated_prepare(self):
9797
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
9898
self.assertIn("deprecated", "\n".join(cm.output))
9999

100+
# Shared input for the convert_to_readalong / convert_to_offline_html tests.
# Each inner list is one sentence of api.Token objects; an empty list marks a
# paragraph break and two empty lists in a row mark a page break.
# Tokens built from text alone (spaces, punctuation) carry no numeric arguments.
sentences_to_convert = [
    # Page 1, paragraph 1
    [
        api.Token("Bonjöûr,", 0.2, 1.0),
        api.Token(" "),
        api.Token("hello", 1.0, 0.2),
        api.Token("!"),
    ],
    [
        api.Token("Sentence2", 4.2, 0.2),
        api.Token("!"),
    ],
    # Paragraph break
    [],
    [
        api.Token("Paragraph2", 4.2, 0.2),
        api.Token("."),
    ],
    # Page break
    [],
    [],
    # Page 2
    [
        api.Token("("),
        api.Token('"'),
        api.Token("Page2", 5.2, 0.2),
        api.Token("."),
        api.Token('"'),
        api.Token(")"),
    ],
]
121+
100122
def test_convert_to_readalong(self):
101-
sentences = [
102-
[
103-
api.Token("Bonjöûr,", 0.2, 1.0),
104-
api.Token(" "),
105-
api.Token("hello", 1.0, 0.2),
106-
api.Token("!"),
107-
],
108-
[api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
109-
[],
110-
[api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
111-
[],
112-
[],
113-
[
114-
api.Token("("),
115-
api.Token('"'),
116-
api.Token("Page2", 5.2, 0.2),
117-
api.Token("."),
118-
api.Token('"'),
119-
api.Token(")"),
120-
],
121-
]
122-
123-
readalong = api.convert_to_readalong(sentences)
123+
124+
readalong = api.convert_to_readalong(self.sentences_to_convert)
124125
# print(readalong)
125126

126127
# Make the reference by calling align with the same text and adjusting
127128
# things we expect to be different.
128129
sentences_as_text = "\n".join(
129-
"".join(token.text for token in sentence) for sentence in sentences
130+
"".join(token.text for token in sentence)
131+
for sentence in self.sentences_to_convert
130132
)
131133
with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
132134
f.write(sentences_as_text)
@@ -152,6 +154,21 @@ def test_convert_to_readalong(self):
152154
readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
153155
self.assertEqual(readalong, align_result)
154156

157+
def test_convert_to_offline_html(self):
    """Check that convert_to_offline_html() returns a plausible Offline HTML document.

    Only the presence of key markers is asserted (page skeleton, generator
    meta tag, embedded readalong and audio data URIs, header slots), not the
    full contents. Nothing is written to disk by the test itself.
    """
    html = api.convert_to_offline_html(
        self.sentences_to_convert, str(self.data_dir / "noise.mp3")
    )
    # Basic HTML skeleton
    self.assertIn("<html", html)
    self.assertIn("<body", html)
    self.assertIn('<meta name="generator" content="@readalongs/studio (cli)', html)
    # The readalong XML and the audio are expected inline as data URIs
    self.assertIn('<read-along href="data:application/readalong+xml;base64', html)
    self.assertIn('audio="data:audio/', html)
    # Title / subtitle slots of the web component
    self.assertIn("<span slot='read-along-header'>", html)
    self.assertIn("<span slot='read-along-subheader'>", html)
171+
155172

156173
if __name__ == "__main__":
157174
main()

0 commit comments

Comments
 (0)