33
33
import collections .abc
34
34
import contextlib
35
35
import io
36
+ import itertools
36
37
import logging
38
+ import operator
37
39
import re
38
40
import typing
39
41
40
- import bs4
42
+ import lxml . etree as ET
41
43
42
44
from aeneas .idsortingalgorithm import IDSortingAlgorithm
43
45
from aeneas .tree import Tree
48
50
# Module-level logger named after this module (stdlib logging convention).
logger = logging.getLogger(__name__)
49
51
50
52
51
- def get_soup (buf : typing .IO [bytes ], * , parse_only = None ) -> bs4 .BeautifulSoup :
52
- return bs4 .BeautifulSoup (buf , "lxml" , parse_only = parse_only )
53
-
54
-
55
53
class TextFileFormat :
56
54
"""
57
55
Enumeration of the supported formats for text files.
def _read_munparsed(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
    """
    Build a three-level Tree (paragraph / sentence / word) from a
    "munparsed" HTML document, where each level is an element whose
    ``id`` attribute matches a configured per-level regex.

    :param buf: the bytes file object
    """
    # NOTE(review): the `cls` parameter suggests this is a @classmethod;
    # the decorator lies outside this hunk — confirm against the full file.

    def parse():
        # Streaming generator: yields (l1_id, l2_id, l3_id, text) tuples in
        # document order, one per L3 (word) element that is nested inside
        # currently-open L2 and L1 elements.
        l1_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX]}\b.*"
        )
        l2_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX]}\b.*"
        )
        l3_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX]}\b.*"
        )

        # ID of the currently-open element at each level; None = not inside one.
        l1_id = l2_id = l3_id = None
        try:
            for event, node in ET.iterparse(
                buf, events=("start", "end"), html=True
            ):
                node_id = node.attrib.get("id")
                if not node_id:
                    # Elements without an id can never match a level regex;
                    # clear them once fully parsed to bound memory usage.
                    if event == "end":
                        node.clear()
                    continue

                if event == "start":
                    # Open the deepest level whose regex matches, but only
                    # when every shallower level is already open (L1 ⊃ L2 ⊃ L3).
                    if l1_id is None and l1_re.match(node_id):
                        l1_id = node_id
                        continue
                    elif (
                        l1_id is not None and l2_id is None and l2_re.match(node_id)
                    ):
                        l2_id = node_id
                        continue
                    elif (
                        l1_id is not None
                        and l2_id is not None
                        and l3_id is None
                        and l3_re.match(node_id)
                    ):
                        l3_id = node_id
                        continue
                elif event == "end":
                    # Close levels on matching end tags; the AssertionErrors
                    # guard against malformed nesting (e.g. an L1 element
                    # closing while an L2/L3 element is still open).
                    if node_id == l1_id:
                        if l2_id is not None:
                            raise AssertionError("Expected l2_id to be empty")
                        if l3_id is not None:
                            raise AssertionError("Expected l3_id to be empty")
                        l1_id = None
                    elif node_id == l2_id:
                        if l1_id is None:
                            raise AssertionError("Expected l1_id to not be empty")
                        if l3_id is not None:
                            raise AssertionError("Expected l3_id to be empty")
                        l2_id = None
                    elif node_id == l3_id:
                        if l1_id is None:
                            raise AssertionError("Expected l1_id to not be empty")
                        if l2_id is None:
                            raise AssertionError("Expected l2_id to not be empty")

                        # A fully-closed word element: emit it.
                        yield l1_id, l2_id, l3_id, node.text
                        l3_id = None

                    node.clear()
        except ET.XMLSyntaxError as e:
            # FIXME: This is a very ugly workaround for empty XML file, find a better way.
            if e.msg == "no element found":
                return

            raise

    tree = Tree()
    # groupby relies on parse() yielding tuples in document order, so that
    # all items belonging to one L1 (and within it, one L2) are contiguous.
    for l1_id, l1_items in itertools.groupby(parse(), key=operator.itemgetter(0)):
        logger.debug("Found L1 node with ID: %r", l1_id)
        paragraph_node = Tree()
        paragraph_text_parts = []
        for l2_id, l2_items in itertools.groupby(
            l1_items, key=operator.itemgetter(1)
        ):
            logger.debug("Found L2 node with ID: %r", l2_id)
            sentence_node = Tree()
            paragraph_node.add_child(sentence_node)
            sentence_text_parts = []
            for _, _, l3_id, l3_text in l2_items:
                logger.debug(
                    "Found L3 node with ID: %r and text: %r", l3_id, l3_text
                )
                word_fragment = TextFragment(
                    identifier=l3_id, lines=[l3_text], filtered_lines=[l3_text]
                )
                word_node = Tree(value=word_fragment)
                sentence_node.add_child(word_node)
                sentence_text_parts.append(l3_text)
            # A sentence's text is the space-joined text of its words.
            sentence_text = " ".join(sentence_text_parts)
            paragraph_text_parts.append(sentence_text)
            sentence_node.value = TextFragment(
                identifier=l2_id,
                lines=[sentence_text],
                filtered_lines=[sentence_text],
            )
            logger.debug("Found L2 node with text: %r", sentence_text)

        # A paragraph's text is the space-joined text of its sentences.
        paragraph_text = " ".join(paragraph_text_parts)
        paragraph_node.value = TextFragment(
            identifier=l1_id,
            lines=[paragraph_text],
            filtered_lines=[paragraph_text],
        )
        tree.add_child(paragraph_node)
        logger.debug("Found L1 node with text: %r", paragraph_text)

    return tree
@@ -879,8 +917,8 @@ def _read_plain(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
879
917
def _get_node_text (node , * , read_img_alt : bool ) -> str :
880
918
if node .text :
881
919
return node .text
882
- elif read_img_alt and node .name == "img" :
883
- alt = node .attrs .get ("alt" )
920
+ elif read_img_alt and node .tag == "img" :
921
+ alt = node .attrib .get ("alt" )
884
922
if alt is not None :
885
923
return alt
886
924
@@ -897,29 +935,32 @@ def _read_unparsed(
897
935
:param bool read_img_alt: if True, read text from `<img/>` tag `alt` attribute
898
936
"""
899
937
900
- def make_soup_strainer () -> bs4 .SoupStrainer :
901
- return bs4 .SoupStrainer (
902
- id = re .compile (
903
- rf".*\b{ parameters [gc .PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX ]} \b.*"
904
- )
905
- )
906
-
907
938
# TODO better and/or parametric parsing,
908
939
# for example, removing tags but keeping text, etc.
909
940
logger .debug ("Parsing fragments from unparsed text format" )
910
941
911
- # transform text in a soup object
912
- soup = get_soup (buf , parse_only = make_soup_strainer ())
913
-
914
- # extract according to id_regex
915
942
text_from_id = {}
916
943
ids = []
917
- for node in soup .find_all ():
918
- node_id = node ["id" ]
919
- node_text = cls ._get_node_text (node , read_img_alt = read_img_alt )
944
+ id_regex = re .compile (
945
+ rf".*\b{ parameters [gc .PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX ]} \b.*"
946
+ )
947
+
948
+ try :
949
+ for _ , node in ET .iterparse (buf , events = ("end" ,), html = True ):
950
+ node_id = node .attrib .get ("id" )
951
+ if node_id and id_regex .match (node_id ):
952
+ text_from_id [node_id ] = cls ._get_node_text (
953
+ node , read_img_alt = read_img_alt
954
+ )
955
+ ids .append (node_id )
956
+
957
+ node .clear ()
958
+ except ET .XMLSyntaxError as e :
959
+ # FIXME: This is very ugly, find a better way.
960
+ if e .msg == "no element found" :
961
+ return Tree ()
920
962
921
- text_from_id [node_id ] = node_text
922
- ids .append (node_id )
963
+ raise
923
964
924
965
# sort by ID as requested
925
966
id_sort = gf .safe_get (
0 commit comments