@@ -18,13 +18,13 @@ def _find_first_comicid(line, ln=None):
18
18
try :
19
19
20
20
# QUIRK: the dataset is full of typos and oddities, so the regex has to be complex
21
- result = re .findall (r'^(\s|)( (g[as]\w+|dr\w+|pg[a-zA-Z0-9_-]+|\w+)(\s|)(--|\.\.|- -|\*\*))' , line , flags = re .I )
21
+ result = re .findall (r'^((g[as]\w+|dr\w+|pg[a-zA-Z0-9_-]+|\w+)(\s|)(--|\.\.|- -|\*\*))' , line , flags = re .I )
22
22
if len (result ) <= 0 :
23
23
raise IndexError ("No match for regex" , result , line )
24
24
if len (result [0 ]) <= 0 :
25
25
raise IndexError ("No group in match 0 in regex (undefined behaviour)" , result [0 ], line )
26
26
27
- return result [0 ][ 1 :]
27
+ return result [0 ]
28
28
29
29
except Exception as e :
30
30
@@ -44,8 +44,10 @@ def cleanup(input_file, output):
44
44
_skip_ahead = _skip_ahead - 1
45
45
continue
46
46
47
+ line = lines [i ].strip ()
48
+
47
49
# find comicid (for merging lines together)
48
- comicid = _find_first_comicid (lines [ i ] , ln = i )
50
+ comicid = _find_first_comicid (line , ln = i )
49
51
50
52
_proc_line = ""
51
53
@@ -57,12 +59,13 @@ def cleanup(input_file, output):
57
59
58
60
try :
59
61
_sub_comicid = _find_first_comicid (_loop_line , ln = i + i2 )
60
- except : # line has no comicid header? TODO: that even happens???
61
- print ("WARNING: malfromed line, # %s :" % (i + i2 ))
62
- print (_loop_line )
63
- print ("Root line # %s :" % i )
64
- print (lines [i ].strip ())
65
- # _proc_line += " " + _loop_line
62
+ except : # line has no comicid header?
63
+ print ("WARNING: malfromed line #%s from %s" % (i + i2 , i ))
64
+ if _loop_line == "-" * len (_loop_line ) or _loop_line == "." * len (_loop_line ):
65
+ print ("skip" )
66
+ else :
67
+ pass
68
+ #_proc_line += " " + _loop_line
66
69
_skip_ahead += 1
67
70
continue
68
71
0 commit comments