Skip to content

Commit

Permalink
feat: add api.convert_to_readalong
Browse files Browse the repository at this point in the history
  • Loading branch information
joanise committed Dec 6, 2024
1 parent 954211c commit 02017c1
Show file tree
Hide file tree
Showing 3 changed files with 142 additions and 3 deletions.
2 changes: 1 addition & 1 deletion readalongs/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -1191,7 +1191,7 @@ def convert_to_xhtml(tokenized_xml, title="Book"):
"""


def create_ras_from_text(lines: Iterable[str], text_languages=Sequence[str]) -> str:
def create_ras_from_text(lines: Iterable[str], text_languages: Sequence[str]) -> str:
"""Create input xml in ReadAlong XML format (see static/read-along-1.2.dtd)
Uses the line sequence to infer paragraph and sentence structure from plain text:
Assumes a double blank line marks a page break, and a single blank line
Expand Down
86 changes: 85 additions & 1 deletion readalongs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,16 +31,26 @@ class like pathlib.Path. Warning: don't just use "/some/path/config.json"
you come across such an exception and you believe the
problem is not in your own code.
- log: any logging messages issued during execution
Additional API function:
convert_to_readalong()
"""

import io
import logging
from typing import Optional, Tuple
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional, Tuple

import click

from readalongs import cli
from readalongs.align import create_ras_from_text
from readalongs.log import LOGGER
from readalongs.text.add_ids_to_xml import add_ids
from readalongs.text.util import parse_xml
from readalongs.util import JoinerCallbackForClick, get_langs_deferred


Expand Down Expand Up @@ -116,11 +126,13 @@ def make_xml(
Returns: (status, exception, log_text)
"""
# plaintextfile is not a file object if passed from click

plaintextfile = (
plaintextfile.name
if isinstance(plaintextfile, click.utils.LazyFile)
else plaintextfile
)
xmlfile = str(xmlfile) if isinstance(xmlfile, Path) else xmlfile
logging_stream = io.StringIO()
logging_handler = logging.StreamHandler(logging_stream)
try:
Expand Down Expand Up @@ -157,3 +169,75 @@ def prepare(*args, **kwargs):
"readalongs.api.prepare() is deprecated. Please use make_xml() instead."
)
return make_xml(*args, **kwargs)


@dataclass
class Token:
    """One token of a readalong sentence.

    Word tokens carry alignment information (time and dur); non-word tokens
    such as punctuation or spacing only carry their text.
    """

    # The surface text of the token, exactly as it should appear in the output.
    text: str
    # Alignment values; both default to None in the constructor.
    time: Optional[float]
    dur: Optional[float]
    # Whether this token is a word, as opposed to punctuation, spacing, etc.
    is_word: bool

    def __init__(
        self,
        text: str,
        time: Optional[float] = None,
        dur: Optional[float] = None,
        is_word: Optional[bool] = None,
    ):
        """Build a token, inferring is_word from time unless given explicitly.

        Word token:
            t = Token("asdf", time=1.3, dur=.34)  or  t = Token("asdf", 1.3, .34)
        Non-word token (e.g., punctuation, spacing):
            t = Token(", ")

        Args:
            text: the text of the token
            time: optional alignment value stored on the token
            dur: optional alignment value stored on the token
            is_word: explicit word/non-word flag; when None, the token is
                considered a word exactly when time is not None
        """
        self.text = text
        self.time = time
        self.dur = dur
        if is_word is None:
            # No explicit flag: having a time (even 0.0) is what makes a word.
            self.is_word = time is not None
        else:
            self.is_word = is_word


def convert_to_readalong(
    sentences: List[List[Token]], language: Tuple[str, ...] = ("und",)
) -> str:
    """Convert a list of sentences of tokens into a readalong XML string.

    Args:
        sentences: a list of sentences, each of which is a list of Token objects.
            Paragraph breaks are marked by an empty sentence (i.e., an empty list).
            Page breaks are marked by two empty sentences in a row.
        language: language code(s) handed to create_ras_from_text() for the
            generated document. Defaults to ("und",), the value that used to
            be hard-coded here.

    Returns:
        str: the readalong XML string, ready to print to a .readalong file
    """
    from lxml import etree

    # Build the document skeleton from the plain text of each sentence; empty
    # sentences become the empty lines that mark paragraph and page breaks.
    xml_text = create_ras_from_text(
        ["".join(token.text for token in sentence) for sentence in sentences],
        language,
    )
    xml = parse_xml(xml_text)

    # NOTE(review): the pairing below assumes create_ras_from_text() emits
    # exactly one <s> element per non-empty sentence, in order. A sentence
    # whose tokens are all empty or whitespace might break that assumption --
    # confirm against create_ras_from_text().
    filtered_sentences = [sentence for sentence in sentences if sentence]
    for sentence, sentence_xml in zip(filtered_sentences, xml.findall(".//s")):
        # Drop the plain text and rebuild the sentence content token by token.
        sentence_xml.text = ""
        for token in sentence:
            if token.is_word:
                w = etree.Element("w")
                w.text = token.text
                # Only write values that are actually set, so a word token
                # missing one never yields time="None" or dur="None".
                if token.time is not None:
                    w.attrib["time"] = str(token.time)
                if token.dur is not None:
                    w.attrib["dur"] = str(token.dur)
                sentence_xml.append(w)
            elif len(sentence_xml):
                # Non-word text after a <w> element lives in that element's tail.
                last_child = sentence_xml[-1]
                last_child.tail = (last_child.tail or "") + token.text
            else:
                # Non-word text before the first <w> is the sentence's own text.
                sentence_xml.text += token.text

    xml = add_ids(xml)
    xml_text = etree.tostring(
        xml,
        encoding="utf-8",
        xml_declaration=True,
    ).decode("utf8")

    return xml_text + "\n"
57 changes: 56 additions & 1 deletion test/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

import os
import re
from contextlib import redirect_stderr
from io import StringIO
from unittest import main
Expand All @@ -13,7 +14,7 @@
from basic_test_case import BasicTestCase
from sound_swallower_stub import SoundSwallowerStub

import readalongs.api as api
from readalongs import api
from readalongs.log import LOGGER


Expand Down Expand Up @@ -96,6 +97,60 @@ def test_deprecated_prepare(self):
api.prepare(self.data_dir / "ej-fra.txt", os.devnull, ("fra",))
self.assertIn("deprecated", "\n".join(cm.output))

def test_convert_to_readalong(self):
    """Compare convert_to_readalong() with what align() produces for the same text.

    The reference is built by running api.align() on the joined token text,
    then normalizing what is expected to differ: ARPABET attributes are
    removed from the align output, the id attribute is moved after time and
    dur, and time/dur values are replaced by placeholders on both sides.
    """
    sentences = [
        [
            api.Token("Bonjöûr,", 0.2, 1.0),
            api.Token(" "),
            api.Token("hello", 1.0, 0.2),
            api.Token("!"),
        ],
        [api.Token("Sentence2", 4.2, 0.2), api.Token("!")],
        [],  # paragraph break
        [api.Token("Paragraph2", 4.2, 0.2), api.Token(".")],
        [],  # two empty sentences in a row: page break
        [],
        [
            api.Token("("),
            api.Token('"'),
            api.Token("Page2", 5.2, 0.2),
            api.Token("."),
            api.Token('"'),
            api.Token(")"),
        ],
    ]

    readalong = api.convert_to_readalong(sentences)

    # Make the reference by calling align with the same text and adjusting
    # things we expect to be different.
    sentences_as_text = "\n".join(
        "".join(token.text for token in sentence) for sentence in sentences
    )
    with open(self.tempdir / "sentences.txt", "w", encoding="utf8") as f:
        f.write(sentences_as_text)
    result = api.align(
        self.tempdir / "sentences.txt",
        self.data_dir / "noise.mp3",
        self.tempdir / "output",
        ("und",),
    )
    # Fail right here with the full align() result instead of letting the
    # open() below raise an unrelated FileNotFoundError.
    self.assertEqual(result[0], 0, f"align error: {result}")
    with open(self.tempdir / "output/www/output.readalong", encoding="utf8") as f:
        align_result = f.read()

    align_result = re.sub(r" ARPABET=\".*?\"", "", align_result)
    align_result = re.sub(
        r'<w (id=".*?") time=".*?" dur=".*?"',
        r'<w time="ttt" dur="ddd" \1',
        align_result,
    )
    readalong = re.sub(r"time=\".*?\"", 'time="ttt"', readalong)
    readalong = re.sub(r"dur=\".*?\"", 'dur="ddd"', readalong)
    self.assertEqual(readalong, align_result.strip())


# Run the test suite through unittest's main() when executed as a script.
if __name__ == "__main__":
    main()

0 comments on commit 02017c1

Please sign in to comment.