33
33
import collections .abc
34
34
import contextlib
35
35
import io
36
+ import itertools
36
37
import logging
38
+ import operator
37
39
import re
38
40
import typing
39
41
40
- import bs4
42
+ import lxml . etree as ET
41
43
42
44
from aeneas .idsortingalgorithm import IDSortingAlgorithm
43
45
from aeneas .tree import Tree
48
50
# Module-level logger named after this module (stdlib logging convention).
logger = logging.getLogger(__name__)
49
51
50
52
51
- def get_soup (buf : typing .IO [bytes ], * , parse_only = None ) -> bs4 .BeautifulSoup :
52
- return bs4 .BeautifulSoup (buf , "lxml" , parse_only = parse_only )
53
-
54
-
55
53
class TextFileFormat :
56
54
"""
57
55
Enumeration of the supported formats for text files.
def _read_munparsed(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
    """
    Build a three-level Tree (paragraph / sentence / word) from a
    "munparsed" HTML document, where each level is an element whose
    ``id`` attribute matches a configured per-level regex.

    :param buf: the bytes file object
    """
    # NOTE(review): the `cls` parameter suggests this is a @classmethod;
    # the decorator lies outside this hunk — confirm against the full file.

    def parse():
        # Streaming generator: yields (l1_id, l2_id, l3_id, text) tuples in
        # document order, one per L3 (word) element that is nested inside
        # currently-open L2 and L1 elements.
        l1_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L1_ID_REGEX]}\b.*"
        )
        l2_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L2_ID_REGEX]}\b.*"
        )
        l3_re = re.compile(
            rf".*\b{parameters[gc.PPN_TASK_IS_TEXT_MUNPARSED_L3_ID_REGEX]}\b.*"
        )

        # ID of the currently-open element at each level; None = not inside one.
        l1_id = l2_id = l3_id = None
        try:
            for event, node in ET.iterparse(
                buf, events=("start", "end"), html=True
            ):
                node_id = node.attrib.get("id")
                if not node_id:
                    # Elements without an id can never match a level regex;
                    # clear them once fully parsed to bound memory usage.
                    if event == "end":
                        node.clear()
                    continue

                if event == "start":
                    # Open the deepest level whose regex matches, but only
                    # when every shallower level is already open (L1 ⊃ L2 ⊃ L3).
                    if l1_id is None and l1_re.match(node_id):
                        l1_id = node_id
                        continue
                    elif (
                        l1_id is not None and l2_id is None and l2_re.match(node_id)
                    ):
                        l2_id = node_id
                        continue
                    elif (
                        l1_id is not None
                        and l2_id is not None
                        and l3_id is None
                        and l3_re.match(node_id)
                    ):
                        l3_id = node_id
                        continue
                elif event == "end":
                    # Close levels on matching end tags; the AssertionErrors
                    # guard against malformed nesting (e.g. an L1 element
                    # closing while an L2/L3 element is still open).
                    if node_id == l1_id:
                        if l2_id is not None:
                            raise AssertionError("Expected l2_id to be empty")
                        if l3_id is not None:
                            raise AssertionError("Expected l3_id to be empty")
                        l1_id = None
                    elif node_id == l2_id:
                        if l1_id is None:
                            raise AssertionError("Expected l1_id to not be empty")
                        if l3_id is not None:
                            raise AssertionError("Expected l3_id to be empty")
                        l2_id = None
                    elif node_id == l3_id:
                        if l1_id is None:
                            raise AssertionError("Expected l1_id to not be empty")
                        if l2_id is None:
                            raise AssertionError("Expected l2_id to not be empty")

                        # A fully-closed word element: emit it.
                        yield l1_id, l2_id, l3_id, node.text
                        l3_id = None

                    node.clear()
        except ET.XMLSyntaxError as e:
            # FIXME: This is a very ugly workaround for empty XML file, find a better way.
            if e.msg == "no element found":
                return

            raise

    tree = Tree()
    # groupby relies on parse() yielding tuples in document order, so that
    # all items belonging to one L1 (and within it, one L2) are contiguous.
    for l1_id, l1_items in itertools.groupby(parse(), key=operator.itemgetter(0)):
        logger.debug("Found L1 node with ID: %r", l1_id)
        paragraph_node = Tree()
        paragraph_text_parts = []
        for l2_id, l2_items in itertools.groupby(
            l1_items, key=operator.itemgetter(1)
        ):
            logger.debug("Found L2 node with ID: %r", l2_id)
            sentence_node = Tree()
            paragraph_node.add_child(sentence_node)
            sentence_text_parts = []
            for _, _, l3_id, l3_text in l2_items:
                logger.debug(
                    "Found L3 node with ID: %r and text: %r", l3_id, l3_text
                )
                word_fragment = TextFragment(
                    identifier=l3_id, lines=[l3_text], filtered_lines=[l3_text]
                )
                word_node = Tree(value=word_fragment)
                sentence_node.add_child(word_node)
                sentence_text_parts.append(l3_text)
            # A sentence's text is the space-joined text of its words.
            sentence_text = " ".join(sentence_text_parts)
            paragraph_text_parts.append(sentence_text)
            sentence_node.value = TextFragment(
                identifier=l2_id,
                lines=[sentence_text],
                filtered_lines=[sentence_text],
            )
            logger.debug("Found L2 node with text: %r", sentence_text)

        # A paragraph's text is the space-joined text of its sentences.
        paragraph_text = " ".join(paragraph_text_parts)
        paragraph_node.value = TextFragment(
            identifier=l1_id,
            lines=[paragraph_text],
            filtered_lines=[paragraph_text],
        )
        tree.add_child(paragraph_node)
        logger.debug("Found L1 node with text: %r", paragraph_text)

    return tree
@@ -879,8 +917,8 @@ def _read_plain(cls, buf: typing.IO[bytes], parameters: dict) -> Tree:
879
917
def _get_node_text (node , * , read_img_alt : bool ) -> str :
880
918
if node .text :
881
919
return node .text
882
- elif read_img_alt and node .name == "img" :
883
- alt = node .attrs .get ("alt" )
920
+ elif read_img_alt and node .tag == "img" :
921
+ alt = node .attrib .get ("alt" )
884
922
if alt is not None :
885
923
return alt
886
924
@@ -897,29 +935,32 @@ def _read_unparsed(
897
935
:param bool read_img_alt: if True, read text from `<img/>` tag `alt` attribute
898
936
"""
899
937
900
- def make_soup_strainer () -> bs4 .SoupStrainer :
901
- return bs4 .SoupStrainer (
902
- id = re .compile (
903
- rf".*\b{ parameters [gc .PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX ]} \b.*"
904
- )
905
- )
906
-
907
938
# TODO better and/or parametric parsing,
908
939
# for example, removing tags but keeping text, etc.
909
940
logger .debug ("Parsing fragments from unparsed text format" )
910
941
911
- # transform text in a soup object
912
- soup = get_soup (buf , parse_only = make_soup_strainer ())
913
-
914
- # extract according to id_regex
915
942
text_from_id = {}
916
943
ids = []
917
- for node in soup .find_all ():
918
- node_id = node ["id" ]
919
- node_text = cls ._get_node_text (node , read_img_alt = read_img_alt )
944
+ id_regex = re .compile (
945
+ rf".*\b{ parameters [gc .PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX ]} \b.*"
946
+ )
947
+
948
+ try :
949
+ for _ , node in ET .iterparse (buf , events = ("end" ,), html = True ):
950
+ node_id = node .attrib .get ("id" )
951
+ if node_id and id_regex .match (node_id ):
952
+ text_from_id [node_id ] = cls ._get_node_text (
953
+ node , read_img_alt = read_img_alt
954
+ )
955
+ ids .append (node_id )
956
+
957
+ node .clear ()
958
+ except ET .XMLSyntaxError as e :
959
+ # FIXME: This is very ugly, find a better way.
960
+ if e .msg == "no element found" :
961
+ return Tree ()
920
962
921
- text_from_id [node_id ] = node_text
922
- ids .append (node_id )
963
+ raise
923
964
924
965
# sort by ID as requested
925
966
id_sort = gf .safe_get (
0 commit comments