Skip to content

Commit c0c8f73

Browse files
committed
Remove BeautifulSoup4 dependency
1 parent 5ff9fad commit c0c8f73

16 files changed

+219
-170
lines changed

README.md

+1-1
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ in several formats, depending on its application:
6363
2. [Python](https://python.org/) 2.7 (Linux, OS X, Windows) or 3.5 or later (Linux, OS X)
6464
3. [FFmpeg](https://www.ffmpeg.org/)
6565
4. [eSpeak](http://espeak.sourceforge.net/)
66-
5. Python packages `BeautifulSoup4`, `lxml`, and `numpy`
66+
5. Python packages `lxml`, `numpy` and `Jinja2`
6767
6. Python headers to compile the Python C/C++ extensions (optional but strongly recommended)
6868
7. A shell supporting UTF-8 (optional but strongly recommended)
6969

README.rst

+1-1
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ System Requirements
7979
later (Linux, OS X)
8080
3. `FFmpeg <https://www.ffmpeg.org/>`__
8181
4. `eSpeak <http://espeak.sourceforge.net/>`__
82-
5. Python packages ``BeautifulSoup4``, ``lxml``, and ``numpy``
82+
5. Python packages ``lxml``, ``numpy`` and ``Jinja2``
8383
6. Python headers to compile the Python C/C++ extensions (optional but
8484
strongly recommended)
8585
7. A shell supporting UTF-8 (optional but strongly recommended)

aeneas/executetask.py

+5-2
Original file line numberDiff line numberDiff line change
@@ -143,14 +143,17 @@ def execute(self):
143143
f"The text file of the task has {len(self.task.text_file):d} fragments, "
144144
f"more than the maximum allowed ({task_max_text_length:d})."
145145
)
146-
if self.task.text_file.chars == 0:
146+
if not self.task.text_file.chars:
147147
raise ExecuteTaskInputError("The task text file seems to have empty text")
148148

149149
logger.debug("Both audio and text input file are present")
150150

151151
# execute
152152
self.step_index = 1
153-
if self.task.configuration[gc.PPN_TASK_IS_TEXT_FILE_FORMAT] in TextFileFormat.MULTILEVEL_VALUES:
153+
if (
154+
self.task.configuration[gc.PPN_TASK_IS_TEXT_FILE_FORMAT]
155+
in TextFileFormat.MULTILEVEL_VALUES
156+
):
154157
self._execute_multi_level_task()
155158
else:
156159
self._execute_single_level_task()

aeneas/textfile.py

+128-87
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,13 @@
3333
import collections.abc
3434
import contextlib
3535
import io
36+
import itertools
3637
import logging
38+
import operator
3739
import re
3840
import typing
3941

40-
import bs4
42+
import lxml.etree as ET
4143

4244
from aeneas.idsortingalgorithm import IDSortingAlgorithm
4345
from aeneas.tree import Tree
@@ -48,10 +50,6 @@
4850
logger = logging.getLogger(__name__)
4951

5052

51-
def get_soup(buf: typing.IO[bytes], *, parse_only=None) -> bs4.BeautifulSoup:
52-
return bs4.BeautifulSoup(buf, "lxml", parse_only=parse_only)
53-
54-
5553
class TextFileFormat:
5654
"""
5755
Enumeration of the supported formats for text files.
@@ -737,74 +735,114 @@ def _read_munparsed(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
737735
:param buf: the bytes file object
738736
"""
739737

740-
def nodes_at_level(root, level: int):
741-
"""Return a dict with the bs4 filter parameters"""
742-
LEVEL_TO_REGEX_MAP = [
743-
None,
744-
gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX,
745-
gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX,
746-
gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX,
747-
]
748-
attribute_name = "id"
749-
regex_string = parameters[LEVEL_TO_REGEX_MAP[level]]
750-
logger.debug("Regex for %s: %r", attribute_name, regex_string)
751-
regex = re.compile(rf".*\b{regex_string}\b.*")
752-
return root.find_all(attrs={attribute_name: regex})
738+
def parse():
739+
l1_re = re.compile(
740+
rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX]}\b.*"
741+
)
742+
l2_re = re.compile(
743+
rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX]}\b.*"
744+
)
745+
l3_re = re.compile(
746+
rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX]}\b.*"
747+
)
753748

754-
# TODO better and/or parametric parsing,
755-
# for example, removing tags but keeping text, etc.
756-
logger.debug("Parsing fragments from munparsed text format")
757-
# transform text in a soup object
758-
soup = get_soup(buf)
759-
# extract according to id_regex
760-
logger.debug("Finding L1 elements")
761-
tree = Tree()
762-
for l1_node in nodes_at_level(soup, 1):
763-
has_word = False
749+
l1_id = l2_id = l3_id = None
764750
try:
765-
l1_id = l1_node["id"]
766-
logger.debug("Found L1 node with ID: %r", l1_id)
767-
paragraph_node = Tree()
768-
paragraph_text_parts = []
769-
for l2_node in nodes_at_level(l1_node, 2):
770-
l2_id = l2_node["id"]
771-
logger.debug("Found L2 node with ID: %r", l2_id)
772-
sentence_node = Tree()
773-
paragraph_node.add_child(sentence_node)
774-
sentence_text_parts = []
775-
for l3_node in nodes_at_level(l2_node, 3):
776-
l3_id = l3_node["id"]
777-
l3_text = l3_node.text
778-
logger.debug("Found L3 node with ID: %r", l3_id)
779-
logger.debug("Found L3 node with text: %r", l3_text)
780-
word_fragment = TextFragment(
781-
identifier=l3_id, lines=[l3_text], filtered_lines=[l3_text]
782-
)
783-
word_node = Tree(value=word_fragment)
784-
sentence_node.add_child(word_node)
785-
sentence_text_parts.append(l3_text)
786-
has_word = True
787-
sentence_text = " ".join(sentence_text_parts)
788-
paragraph_text_parts.append(sentence_text)
789-
sentence_node.value = TextFragment(
790-
identifier=l2_id,
791-
lines=[sentence_text],
792-
filtered_lines=[sentence_text],
751+
for event, node in ET.iterparse(
752+
buf, events=("start", "end"), html=True
753+
):
754+
node_id = node.attrib.get("id")
755+
if not node_id:
756+
if event == "end":
757+
node.clear()
758+
continue
759+
760+
if event == "start":
761+
if l1_id is None and l1_re.match(node_id):
762+
l1_id = node_id
763+
continue
764+
elif (
765+
l1_id is not None and l2_id is None and l2_re.match(node_id)
766+
):
767+
l2_id = node_id
768+
continue
769+
elif (
770+
l1_id is not None
771+
and l2_id is not None
772+
and l3_id is None
773+
and l3_re.match(node_id)
774+
):
775+
l3_id = node_id
776+
continue
777+
elif event == "end":
778+
if node_id == l1_id:
779+
if l2_id is not None:
780+
raise AssertionError("Expected l2_id to be empty")
781+
if l3_id is not None:
782+
raise AssertionError("Expected l3_id to be empty")
783+
l1_id = None
784+
elif node_id == l2_id:
785+
if l1_id is None:
786+
raise AssertionError("Expected l1_id to not be empty")
787+
if l3_id is not None:
788+
raise AssertionError("Expected l3_id to be empty")
789+
l2_id = None
790+
elif node_id == l3_id:
791+
if l1_id is None:
792+
raise AssertionError("Expected l1_id to not be empty")
793+
if l2_id is None:
794+
raise AssertionError("Expected l2_id to not be empty")
795+
796+
yield l1_id, l2_id, l3_id, node.text
797+
l3_id = None
798+
799+
node.clear()
800+
except ET.XMLSyntaxError as e:
801+
# FIXME: This is a very ugly workaround for an empty XML file; find a better way.
802+
if e.msg == "no element found":
803+
return
804+
805+
raise
806+
807+
tree = Tree()
808+
for l1_id, l1_items in itertools.groupby(parse(), key=operator.itemgetter(0)):
809+
logger.debug("Found L1 node with ID: %r", l1_id)
810+
paragraph_node = Tree()
811+
paragraph_text_parts = []
812+
for l2_id, l2_items in itertools.groupby(
813+
l1_items, key=operator.itemgetter(1)
814+
):
815+
logger.debug("Found L2 node with ID: %r", l2_id)
816+
sentence_node = Tree()
817+
paragraph_node.add_child(sentence_node)
818+
sentence_text_parts = []
819+
for _, _, l3_id, l3_text in l2_items:
820+
logger.debug(
821+
"Found L3 node with ID: %r and text: %r", l3_id, l3_text
793822
)
794-
logger.debug("Found L2 node with text: %r", sentence_text)
795-
if has_word:
796-
paragraph_text = " ".join(paragraph_text_parts)
797-
paragraph_node.value = TextFragment(
798-
identifier=l1_id,
799-
lines=[paragraph_text],
800-
filtered_lines=[paragraph_text],
823+
word_fragment = TextFragment(
824+
identifier=l3_id, lines=[l3_text], filtered_lines=[l3_text]
801825
)
802-
tree.add_child(paragraph_node)
803-
logger.debug("Found L1 node with text: %r", paragraph_text)
804-
else:
805-
logger.debug("Found L1 node but it has no words, skipping")
806-
except KeyError as exc:
807-
logger.warning("KeyError (%s) while parsing a L1 node", exc)
826+
word_node = Tree(value=word_fragment)
827+
sentence_node.add_child(word_node)
828+
sentence_text_parts.append(l3_text)
829+
sentence_text = " ".join(sentence_text_parts)
830+
paragraph_text_parts.append(sentence_text)
831+
sentence_node.value = TextFragment(
832+
identifier=l2_id,
833+
lines=[sentence_text],
834+
filtered_lines=[sentence_text],
835+
)
836+
logger.debug("Found L2 node with text: %r", sentence_text)
837+
838+
paragraph_text = " ".join(paragraph_text_parts)
839+
paragraph_node.value = TextFragment(
840+
identifier=l1_id,
841+
lines=[paragraph_text],
842+
filtered_lines=[paragraph_text],
843+
)
844+
tree.add_child(paragraph_node)
845+
logger.debug("Found L1 node with text: %r", paragraph_text)
808846

809847
return tree
810848

@@ -879,8 +917,8 @@ def _read_plain(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
879917
def _get_node_text(node, *, read_img_alt: bool) -> str:
880918
if node.text:
881919
return node.text
882-
elif read_img_alt and node.name == "img":
883-
alt = node.attrs.get("alt")
920+
elif read_img_alt and node.tag == "img":
921+
alt = node.attrib.get("alt")
884922
if alt is not None:
885923
return alt
886924

@@ -897,29 +935,32 @@ def _read_unparsed(
897935
:param bool read_img_alt: if True, read text from `<img/>` tag `alt` attribute
898936
"""
899937

900-
def make_soup_strainer() -> bs4.SoupStrainer:
901-
return bs4.SoupStrainer(
902-
id=re.compile(
903-
rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX]}\b.*"
904-
)
905-
)
906-
907938
# TODO better and/or parametric parsing,
908939
# for example, removing tags but keeping text, etc.
909940
logger.debug("Parsing fragments from unparsed text format")
910941

911-
# transform text in a soup object
912-
soup = get_soup(buf, parse_only=make_soup_strainer())
913-
914-
# extract according to id_regex
915942
text_from_id = {}
916943
ids = []
917-
for node in soup.find_all():
918-
node_id = node["id"]
919-
node_text = cls._get_node_text(node, read_img_alt=read_img_alt)
944+
id_regex = re.compile(
945+
rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX]}\b.*"
946+
)
947+
948+
try:
949+
for _, node in ET.iterparse(buf, events=("end",), html=True):
950+
node_id = node.attrib.get("id")
951+
if node_id and id_regex.match(node_id):
952+
text_from_id[node_id] = cls._get_node_text(
953+
node, read_img_alt=read_img_alt
954+
)
955+
ids.append(node_id)
956+
957+
node.clear()
958+
except ET.XMLSyntaxError as e:
959+
# FIXME: This is a very ugly workaround for empty input; find a better way.
960+
if e.msg == "no element found":
961+
return Tree()
920962

921-
text_from_id[node_id] = node_text
922-
ids.append(node_id)
963+
raise
923964

924965
# sort by ID as requested
925966
id_sort = gf.safe_get(

aeneas_check_setup.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ def check_import():
9797
" A. you did not download/git-clone the aeneas package properly; or"
9898
)
9999
print_info(" B. you did not install the required Python packages:")
100-
print_info(" 1. BeautifulSoup4")
101-
print_info(" 2. lxml")
102-
print_info(" 3. numpy")
100+
print_info(" 1. lxml")
101+
print_info(" 2. numpy")
102+
print_info(" 3. Jinja2")
103103
except Exception as e:
104104
print_error(e)
105105
return True

bin/aeneas_check_setup.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ def check_import():
9797
" A. you did not download/git-clone the aeneas package properly; or"
9898
)
9999
print_info(" B. you did not install the required Python packages:")
100-
print_info(" 1. BeautifulSoup4")
101-
print_info(" 2. lxml")
102-
print_info(" 3. numpy")
100+
print_info(" 1. lxml")
101+
print_info(" 2. numpy")
102+
print_info(" 3. Jinja2")
103103
except Exception as e:
104104
print_error(e)
105105
return True

check_dependencies.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -97,9 +97,9 @@ def check_import():
9797
" A. you did not download/git-clone the aeneas package properly; or"
9898
)
9999
print_info(" B. you did not install the required Python packages:")
100-
print_info(" 1. BeautifulSoup4")
101-
print_info(" 2. lxml")
102-
print_info(" 3. numpy")
100+
print_info(" 1. lxml")
101+
print_info(" 2. numpy")
102+
print_info(" 3. Jinja2")
103103
except Exception as e:
104104
print_error(e)
105105
return True

debian/control

+2-2
Original file line numberDiff line numberDiff line change
@@ -6,10 +6,10 @@ Build-Depends: debhelper (>= 9.0.0),
66
dh-python,
77
python-all-dev,
88
python-setuptools,
9-
python-numpy, python-lxml, python-bs4,
9+
python-numpy, python-lxml,
1010
python3-all-dev,
1111
python3-setuptools,
12-
python3-numpy, python3-lxml, python3-bs4,
12+
python3-numpy, python3-lxml,
1313
libasound2-dev, libsndfile1-dev, libespeak-dev
1414
Standards-Version: 4.1.4
1515
Homepage: https://github.com/readbeyond/aeneas

docs/source/libtutorial.rst

+3-3
Original file line numberDiff line numberDiff line change
@@ -127,12 +127,12 @@ Dependencies
127127

128128
* ``numpy`` (v1.9 or later)
129129
* ``lxml`` (v3.6.0 or later)
130-
* ``BeautifulSoup`` (v4.5.1 or later)
130+
* ``Jinja2`` (v3.0.0 or later)
131131

132132
Only ``numpy`` is actually needed, as it is heavily used for the alignment computation.
133133

134-
The other two dependencies (``lxml`` and ``BeautifulSoup``) are needed
135-
only if you use XML-like input or output formats.
134+
``lxml`` is needed only if you use XML-like input or output formats.
135+
``Jinja2`` is needed only if you want to generate the finetuneas HTML output.
136136
However, since they are popular Python packages, to avoid complex import testing,
137137
they are listed as requirements.
138138
This choice might change in the future.

requirements.txt

-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,3 @@
1-
BeautifulSoup4>=4.5.1
21
lxml>=3.6.0
32
numpy>=1.9
43
Jinja2>=3.0.0

setupmeta.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@
4949
# required packages to install
5050
# NOTE: always use exact version numbers
5151
# NOTE: this list should be the same as requirements.txt
52-
PKG_INSTALL_REQUIRES = ["BeautifulSoup4>=4.5.1", "lxml>=3.6.0", "numpy>=1.9"]
52+
PKG_INSTALL_REQUIRES = ["lxml>=3.6.0", "numpy>=1.9", "Jinja2>=3.0.0"]
5353

5454
# packages to be distributed
5555
# NOTE: not including the aeneas.test package to keep the size small

tests/test_sd.py

+1-4
Original file line numberDiff line numberDiff line change
@@ -33,10 +33,7 @@ class TestSD(BaseCase):
3333
def load(self):
3434
audio_file_mfcc = AudioFileMFCC(self.file_path(self.AUDIO_FILE))
3535
with open(self.file_path(self.TEXT_FILE), mode="rb") as text_f:
36-
text_file = TextFile.load(
37-
text_f,
38-
file_format=TextFileFormat.PLAIN
39-
)
36+
text_file = TextFile.load(text_f, file_format=TextFileFormat.PLAIN)
4037
text_file.set_language(Language.ENG)
4138
return SD(audio_file_mfcc, text_file)
4239

0 commit comments

Comments
 (0)