-
Notifications
You must be signed in to change notification settings - Fork 0
/
transform_ms_normalized.py
845 lines (832 loc) · 37.2 KB
/
transform_ms_normalized.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
# This script transforms xml documents into html for the website.
# Used for text type "ms" (manuscripts/transcription) as normalized,
# i.e. without visible manuscript additions/deletions, just the end
# result of the author's changes, such as all text tagged <del> gone.
import re
import os
from bs4 import BeautifulSoup
import copy
# folder that is scanned for the .xml source files
SOURCE_FOLDER = "documents/xml"
# folder the resulting .html files are written to
# (write_string_to_file creates it if it doesn't exist)
OUTPUT_FOLDER = "documents/html"
# the script version used on the website gets the text language
# value from the site, in this version we set it here
# it affects the @lang value for the top div,
# the heading for the list of footnotes
# and, for "fi", the numbering of the footnote id:s
# (they are shifted by 500 in transform_footnotes)
LANGUAGE = "fi"
# loop through xml source files in folder and append to list
def get_source_file_paths():
    """Return the names of all .xml files found in SOURCE_FOLDER.

    Only the bare file names are returned (not full paths), in the
    order os.listdir() reports them.
    """
    return [
        entry
        for entry in os.listdir(SOURCE_FOLDER)
        if entry.endswith(".xml")
    ]
# read an xml file and return its content as a soup object
def read_xml(filename):
    """Read an xml file from SOURCE_FOLDER and return it as a soup object.

    The file is decoded as utf-8 (a leading BOM is dropped by the
    "utf-8-sig" codec). Before parsing, line-end hyphens are made
    uniform by replace_hyphens().

    filename: name of the file inside SOURCE_FOLDER.
    Returns a BeautifulSoup object created with the "xml" parser.
    """
    source_path = os.path.join(SOURCE_FOLDER, filename)
    with open(source_path, "r", encoding="utf-8-sig") as source_file:
        file_content = source_file.read()
    # hyphens + line breaks have to be replaced before the file's
    # content is made into a BeautifulSoup object
    # replace_hyphens() leaves the content untouched if there is
    # nothing to replace, so there is no need to search for the
    # pattern first (and no need to keep a second copy of the pattern)
    file_content = replace_hyphens(file_content)
    xml_soup = BeautifulSoup(file_content, "xml")
    print("We have old soup.")
    return xml_soup
# in the transcriptions for the manuscript/transcription column,
# each line of text is equivalent to the original manuscript's line,
# including its possible hyphens
# in the transcriptions, either hyphen minus or soft hyphen has been
# used as the kind of hyphen which is to disappear in the reading text,
# and the ¬ (not sign) has been used for a hyphen which is never to disappear
# let's make all hyphens uniform, and visible, by using only hyphen minus
# the (¬|) below checks for either a not sign or an (invisible) soft hyphen
# there may also be <hi> tags involved
def replace_hyphens(file_content):
    """Make line-end hyphens uniform and visible.

    A not sign (¬) or a soft hyphen directly before <lb/> (possibly
    with a closing </hi> in between) is replaced by a hyphen minus.
    The closing </hi>, if present, is kept.

    file_content: the xml file's content as a string.
    Returns the content with the replacements made; content without
    any such hyphens is returned unchanged.
    """
    # the soft hyphen is invisible, so it's written as the escape
    # \u00ad (interpreted by the re module); a literal soft hyphen
    # is easily lost when the file is copied or edited, and without it
    # the pattern would match every single <lb/>
    line_end_hyphen = re.compile(r"(¬|\u00ad)(</hi>)?<lb/>")
    return line_end_hyphen.sub(r"-\2<lb/>", file_content)
def create_html_template():
    """Return a soup object holding an empty html document.

    The template has an empty <title> in <head> and an empty <body>;
    it is parsed with the "lxml" parser.
    """
    template = '''
<!DOCTYPE html>
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
<title></title>
</head>
<body></body>
</html>
'''
    return BeautifulSoup(template, "lxml")
def create_html_file(xml_soup):
    """Move title and body of the xml document into a new html soup.

    xml_soup: the parsed xml document; its teiHeader title and its
    body are read.
    Returns the html soup. The elements inside it are still the xml
    elements; they are transformed later by transform_tags().
    """
    html_soup = create_html_template()
    # the html <title> gets the text of the title in the xml header
    title_text = xml_soup.teiHeader.title.get_text()
    html_soup.head.title.string = title_text
    # put the xml <body> inside the html <body> and then remove
    # the inner (xml) body tag, keeping its contents
    html_soup.body.append(xml_soup.body)
    inner_body = html_soup.body.body
    inner_body.unwrap()
    return html_soup
# go through the xml elements, attributes and values
# and transform them as needed
def transform_tags(html_soup):
    """Transform the xml elements in html_soup to html and return a string.

    Goes through the xml elements, attributes and values one element
    type at a time and renames, unwraps or removes them. This is the
    normalized view: <del> and its contents are removed, <add> is
    unwrapped. Footnotes are handled by transform_footnotes(), empty
    paragraphs and headings by prevent_empty_paragraphs().

    html_soup: soup object as returned by create_html_file(); it is
    modified in place.
    Returns the serialized and tidied html as a string, or an empty
    string if the document has no <div> left after the transformation
    or if its first <div> is an empty template div.
    """
    # transform <p>
    elements = html_soup.find_all("p")
    if len(elements) > 0:
        for element in elements:
            if "rend" in element.attrs:
                rend_value = element["rend"]
                element["class"] = rend_value
                del element["rend"]
            # type values: subtitle, motto
            # note that @type replaces a class set from @rend above
            if "type" in element.attrs:
                type_value = element["type"]
                element["class"] = [type_value]
                # "tei" and "teiManuscript" are required class values
                # in manuscripts/transcriptions for these p:s
                # otherwise some inherent styles won't work
                element["class"].append("tei")
                element["class"].append("teiManuscript")
                if type_value == "subtitle":
                    # as specified in Digital Publishing WAI-ARIA Module 1.1
                    element["role"] = "doc-subtitle"
                del element["type"]
    # transform <lb/>
    # in the transcriptions for the manuscript/transcription column, each line
    # of text is equivalent to the original manuscript's line
    # and the lines within a <p> end with <lb/>, apart from
    # the last line in the paragraph
    elements = html_soup.find_all("lb")
    if len(elements) > 0:
        for element in elements:
            # @break="yes" is for preserving a line break
            # in the reading text, unnecessary in transcriptions
            if "break" in element.attrs:
                del element["break"]
            # line breaks should only occur within other elements containing
            # the document's text, and not on their own directly in the main <div>
            if element.parent.name == "div":
                element.decompose()
            else:
                element.name = "br"
    # transform <pb/>
    elements = html_soup.find_all("pb")
    if len(elements) > 0:
        for element in elements:
            element.name = "span"
            # insert the page number
            if "n" in element.attrs:
                n_value = element["n"]
                element.insert(0, "|" + n_value + "|")
                del element["n"]
            if "type" in element.attrs:
                type_value = element["type"]
                if type_value == "orig":
                    element["class"] = "pb_orig"
                del element["type"]
            # if there's no @type, use this class
            else:
                element["class"] = "pb_orig"
    # transform <lg> (poem stanza)
    elements = html_soup.find_all("lg")
    if len(elements) > 0:
        for element in elements:
            element.name = "p"
            element["class"] = "lg"
    # transform <l> (poem line): each <l> will be a span
    elements = html_soup.find_all("l")
    if len(elements) > 0:
        for element in elements:
            element.name = "span"
            # treat class value as list item in case you have to
            # append more classes
            element["class"] = ["l"]
            # add rend value as another class
            if "rend" in element.attrs:
                element["class"].append(element.get("rend"))
                del element["rend"]
            # insert line break after line span
            line_break = html_soup.new_tag("br")
            element.insert_after(line_break)
    # transform <head>
    # the platform provides <h1> and <h2> for each collection text page
    # (i.e. each publication, or text with different adherent translations/transcriptions)
    # <h1> contains the title of the text, automatically fetched from toc
    # each column is an <article> with the column type (e.g. Transcription) as <h2>
    # therefore the hierarchy of a text of type "ms" should always start with <h3>
    elements = html_soup.find_all("head")
    if len(elements) > 0:
        for element in elements:
            if "type" in element.attrs:
                type_value = element["type"]
                if type_value == "title":
                    element.name = "h3"
                    element["class"] = ["title"]
                if type_value == "section":
                    element.name = "h4"
                    element["class"] = ["section"]
                if type_value == "subchapter":
                    element.name = "h5"
                    element["class"] = ["sub"]
                if type_value == "subchapter2":
                    element.name = "h6"
                    element["class"] = ["sub2"]
                if type_value == "subchapter3":
                    element.name = "h6"
                    element["class"] = ["sub3"]
                # NOTE(review): a @type value other than the five above
                # leaves the element without a class, so the append
                # below would fail; confirm that no other values occur
                element["class"].append("tei")
                element["class"].append("teiManuscript")
                del element["type"]
            # table headers should be <caption>
            elif element.parent.name == "table":
                element.name = "caption"
            # in the xml files, the list header is placed inside
            # <list>; in html we need the header outside
            # of the list element
            # but not as part of the <h> hierarchy
            elif element.parent.name == "list":
                new_header = html_soup.new_tag("p")
                new_header["class"] = ["list_header"]
                new_header["class"].append("tei")
                new_header["class"].append("teiManuscript")
                element.parent.insert_before(new_header)
                list_header = element.extract()
                new_header.insert(0, list_header)
                list_header = element.unwrap()
            # don't transform html tag <head>, just the xml <head>
            elif element.parent.name == "html":
                continue
            # <head> without attribute: chapter heading
            else:
                element.name = "h4"
                element["class"] = ["chapter"]
                element["class"].append("tei")
                element["class"].append("teiManuscript")
    # transform <cell> (in <row> in <table>)
    # also transform cells in a row with @role="label"
    elements = html_soup.find_all("cell")
    if len(elements) > 0:
        for element in elements:
            # <row role="label"> means its cells are to be <th>, not <td>
            if element.parent.name == "row" and "role" in element.parent.attrs:
                element.name = "th"
                element["scope"] = "col"
            else:
                element.name = "td"
            if "rend" in element.attrs:
                element["class"] = "right"
                del element["rend"]
    # transform <row> (in <table>)
    elements = html_soup.find_all("row")
    if len(elements) > 0:
        for element in elements:
            if "role" in element.attrs:
                del element["role"]
            element.name = "tr"
    # transform <list>
    elements = html_soup.find_all("list")
    if len(elements) > 0:
        for element in elements:
            element.name = "ul"
    # transform <item>
    elements = html_soup.find_all("item")
    if len(elements) > 0:
        for element in elements:
            element.name = "li"
    # transform <hi>
    elements = html_soup.find_all("hi")
    if len(elements) > 0:
        for element in elements:
            if "rend" in element.attrs:
                if element["rend"] == "raised":
                    element.name = "sup"
                elif element["rend"] == "sub":
                    element.name = "sub"
                else:
                    element["class"] = element["rend"]
                    element.name = "em"
                del element["rend"]
            else:
                element.name = "i"
    # transform <milestone>
    elements = html_soup.find_all("milestone")
    if len(elements) > 0:
        for element in elements:
            element.name = "hr"
            # use .get() so that a <milestone> without @type
            # becomes a plain <hr> instead of raising KeyError
            milestone_type = element.get("type")
            if milestone_type == "editorial":
                element["class"] = "space"
            if milestone_type == "bar":
                element["class"] = "milestoneBar"
            if "type" in element.attrs:
                del element["type"]
    # transform <anchor>
    elements = html_soup.find_all("anchor")
    if len(elements) > 0:
        for element in elements:
            element.name = "a"
            if "id" in element.attrs:
                id_value = element["id"]
                element["name"] = id_value
                element["class"] = ["anchor"]
                element["class"].append(id_value)
                del element["id"]
    # transform <choice>
    elements = html_soup.find_all("choice")
    if len(elements) > 0:
        for element in elements:
            element.name = "span"
            element["class"] = ["tooltiptrigger"]
            # NOTE(review): children are moved out of the element
            # while .children is being iterated; kept as it was
            for child in element.children:
                if child.name == "orig" or child.name == "reg":
                    element.unwrap()
                    break
                # transform child <expan> as part of the
                # <choice>-transformation
                # if <expan> is empty: unwrap both <expan>
                # and <choice>
                if child.name == "expan":
                    expan_contents = child.get_text()
                    if len(expan_contents) > 0:
                        element["class"].append("ttAbbreviations")
                        element["class"].append("abbr")
                        expan_span = child
                        expan_span.name = "span"
                        expan_span["class"] = ["tooltip"]
                        expan_span["class"].append("ttAbbreviations")
                        element.insert_after(expan_span)
                    else:
                        child.unwrap()
                        element.unwrap()
    # transform <orig>
    elements = html_soup.find_all("orig")
    if len(elements) > 0:
        for element in elements:
            element.unwrap()
    # transform <reg>
    elements = html_soup.find_all("reg")
    if len(elements) > 0:
        for element in elements:
            element.decompose()
    # transform <abbr>
    elements = html_soup.find_all("abbr")
    if len(elements) > 0:
        for element in elements:
            # if parent <choice> has been transformed to this,
            # then <abbr> should be transformed too
            if element.parent.name == "span" and element.parent.attrs == {"class": ["tooltiptrigger", "ttAbbreviations", "abbr"]}:
                element.name = "span"
                element["class"] = "abbr"
            # if parent <choice> has been transformed to this,
            # then there was no sibling <expan> to <abbr>
            # and there's no meaning to keep the <choice> tooltiptrigger
            # and no use for <abbr>, only for its contents
            elif element.parent.name == "span" and element.parent.attrs == {"class": ["tooltiptrigger"]}:
                element.parent.unwrap()
                element.unwrap()
            # if there's no transformed parent <choice>,
            # then <abbr>'s sibling <expan> had no content
            # and <choice> has already been unwrapped as part
            # of that transformation
            # no use for <abbr>, only for its contents
            else:
                element.unwrap()
    # transform <foreign>
    elements = html_soup.find_all("foreign")
    if len(elements) > 0:
        for element in elements:
            element.unwrap()
    # transform <persName>
    elements = html_soup.find_all("persName")
    if len(elements) > 0:
        for element in elements:
            if "corresp" in element.attrs:
                corresp_value = element.get("corresp")
                if corresp_value.isdigit():
                    element["data-id"] = corresp_value
                    element.name = "span"
                    element["class"] = ["person"]
                    element["class"].append("tooltiptrigger")
                    element["class"].append("ttPerson")
                    del element["corresp"]
                else:
                    element.unwrap()
            else:
                element.unwrap()
    # transform <supplied>, add describing tooltip
    elements = html_soup.find_all("supplied")
    if len(elements) > 0:
        for element in elements:
            if "resp" in element.attrs:
                del element["resp"]
            if "type" in element.attrs:
                # supplied with @type="gap" is used when the editor
                # can guess what it said, and wants the ms to have
                # a gap and the reading text to contain the guess
                # thus, this should be shown just as gap in an ms
                if element["type"] == "gap":
                    element.name = "span"
                    element.clear()
                    element["class"] = ["gap"]
                    element["class"].append("tooltiptrigger")
                    element["class"].append("ttMs")
                    explanatory_span = html_soup.new_tag("span")
                    explanatory_span["class"] = ["tooltip"]
                    explanatory_span["class"].append("ttMs")
                    # insert explanatory text in tooltip span
                    explanatory_span.insert(0, "oläsligt")
                    element.insert(0, "[...]")
                    element.insert_after(explanatory_span)
                # supplied with @type="editorial" is used (inside a <head>)
                # when the editor wants to add a heading for a text that is
                # missing the highest level of heading (the title describing the
                # whole text); this kind of supplied is shown in the ms,
                # because it's good practice for the html (otherwise there would be
                # a gap in the h-hierarchy, if the highest heading is missing
                # and the levels used are lower ones, like "chapter")
                if element["type"] == "editorial":
                    element.name = "span"
                    element["class"] = ["choice"]
                    element["class"].append("tooltiptrigger")
                    element["class"].append("ttChanges")
                    element["class"].append("editorial")
                    element["class"].append("tei")
                    element["class"].append("teiManuscript")
                    explanatory_span = html_soup.new_tag("span")
                    explanatory_span["class"] = ["tooltip"]
                    explanatory_span["class"].append("ttChanges")
                    # insert explanatory text in tooltip span
                    explanatory_span.insert(0, "tillagt av utgivaren")
                    element.insert_after(explanatory_span)
                del element["type"]
            # normal supplied should not be present in ms
            # since it contains an editor's additions to the text
            else:
                element.decompose()
    # transform <xref>
    elements = html_soup.find_all("xref")
    if len(elements) > 0:
        for element in elements:
            # the type attribute is required, and either id or target
            # depending on the type of link
            if "type" in element.attrs and ("id" in element.attrs or "target" in element.attrs):
                xref_type = element.get("type")
                # we need a valid type value
                if xref_type == "":
                    element.unwrap()
                    continue
                element.name = "a"
                element["class"] = ["xreference"]
                # link to other texts on the site
                if (xref_type == "introduction" or xref_type == "readingtext") and "id" in element.attrs:
                    # id means a link to another text on the site
                    # in XML given as collection_id + "_" + publication_id
                    # (+ possibly "_" and a pos-value)
                    xref_id = element.get("id")
                    # we need an id value
                    if xref_id == "":
                        element.unwrap()
                        continue
                    else:
                        xref_id = xref_id.replace("_", " ")
                        element["href"] = xref_id
                        del element["id"]
                    if xref_type == "introduction":
                        element["class"].append("ref_introduction")
                    if xref_type == "readingtext":
                        element["class"].append("ref_readingtext")
                    del element["type"]
                # link to external site
                if xref_type == "ext" and "target" in element.attrs:
                    xref_target = element.get("target")
                    # we need an url
                    if xref_target == "":
                        element.unwrap()
                        continue
                    element["class"].append("ref_external")
                    del element["type"]
                    element["href"] = xref_target
                    del element["target"]
                # in case the type was paired with the wrong attribute (id/target)
                # e.g. type="readingtext" and target=""
                if not "href" in element.attrs:
                    element.unwrap()
            else:
                element.unwrap()
    # transform <address>
    elements = html_soup.find_all("address")
    if len(elements) > 0:
        for element in elements:
            element.name = "p"
            element["class"] = "address"
    # transform <dateline>
    elements = html_soup.find_all("dateline")
    if len(elements) > 0:
        for element in elements:
            element.name = "p"
            element["class"] = "dateline"
    # transform <salute>
    elements = html_soup.find_all("salute")
    if len(elements) > 0:
        for element in elements:
            element.name = "p"
            element["class"] = "salute"
    # transform <signed>
    elements = html_soup.find_all("signed")
    if len(elements) > 0:
        for element in elements:
            element.name = "p"
            element["class"] = "signed"
    # transform <add>
    elements = html_soup.find_all("add")
    if len(elements) > 0:
        for element in elements:
            element.unwrap()
    # transform <del>
    # the tag and its contents shouldn't be present
    # in the normalized manuscript/transcription view
    elements = html_soup.find_all("del")
    if len(elements) > 0:
        for element in elements:
            element.decompose()
    # transform <gap>, add describing tooltip
    elements = html_soup.find_all("gap")
    if len(elements) > 0:
        for element in elements:
            # @reason="overstrike" equals <del> in normalized view
            if "reason" in element.attrs:
                element.decompose()
            else:
                element.name = "span"
                element["class"] = ["gap"]
                element["class"].append("tooltiptrigger")
                element["class"].append("ttMs")
                explanatory_span = html_soup.new_tag("span")
                explanatory_span["class"] = ["tooltip"]
                explanatory_span["class"].append("ttMs")
                # insert explanatory text in tooltip span
                explanatory_span.insert(0, "oläsligt")
                element.insert(0, "[...]")
                element.insert(1, explanatory_span)
    # transform <unclear>, add describing tooltip
    elements = html_soup.find_all("unclear")
    if len(elements) > 0:
        for element in elements:
            element.name = "span"
            element["class"] = ["unclear"]
            element["class"].append("tooltiptrigger")
            element["class"].append("ttMs")
            # insert explanatory text in tooltip span
            explanatory_span = html_soup.new_tag("span")
            explanatory_span["class"] = ["tooltip"]
            explanatory_span["class"].append("ttMs")
            explanatory_span.insert(0, "svårtytt")
            element.insert_after(explanatory_span)
    # transform <div>
    # also handle footnotes <note> for each <div>
    # first find the top <div> and add this text's language value to it
    # find() returns None if the file has no <div> at all
    element = html_soup.find("div")
    if element is not None and "type" in element.attrs:
        element["lang"] = LANGUAGE
    elements = html_soup.find_all("div")
    if len(elements) > 0:
        for element in elements:
            if "type" in element.attrs:
                div_type_value = element["type"]
                if div_type_value == "chapter" or div_type_value == "section":
                    element.name = "section"
                else:
                    element["class"] = [div_type_value]
                    element["class"].append("tei")
                    element["class"].append("teiManuscript")
                del element["type"]
                # these are subgroups to the hansard div
                # for the transformation of <p> we need
                # the top <div> value
                if div_type_value == "LM_written" or div_type_value == "LM_discussion" or div_type_value == "written" or div_type_value == "discussion":
                    div_type_value = "hansard"
                # transform footnotes separately for each <div>
                # so that we can have different footnote lists
                # one list per <div>
                # if there's just one <div>, and it has content:
                # transform all of its notes
                if len(elements) == 1 and len(element.contents) > 1:
                    notes = html_soup.find_all("note")
                    if len(notes) > 0:
                        transform_footnotes(notes, html_soup)
                # if there's more than one <div>, and the <div>
                # we're looking at right now has content:
                # transform the notes of its (possible) subdivs separately
                elif len(elements) > 1 and len(element.contents) > 1:
                    for child in element.children:
                        if child.name == "div" and "type" in child.attrs:
                            notes = child.find_all("note")
                            if len(notes) > 0:
                                transform_footnotes(notes, html_soup)
                    # if there are notes both to the top div and to
                    # a subdiv, this fixes the notes for the top div
                    notes = html_soup.find_all("note")
                    if len(notes) > 0:
                        transform_footnotes(notes, html_soup)
                # files that only contain a template with an empty
                # div should get their own div class
                # this empty div will later on get transformed to an empty string
                elif len(element.get_text(strip = True)) == 0:
                    div_type_value = "empty"
                    element["class"] = div_type_value
                # a <div> with text but only one child node:
                # look up the notes of this <div> here, since no
                # branch above has done so for this <div> (the variable
                # would otherwise be unset, or hold notes that
                # were already transformed for an earlier <div>)
                else:
                    notes = element.find_all("note")
                    if len(notes) > 0:
                        transform_footnotes(notes, html_soup)
            # <div> should always have @type, otherwise I have
            # no idea what it stands for and can't do anything
            # with it
            else:
                element.unwrap()
    # transform <note> if it's not a footnote but is used for
    # editors' explanations
    # footnotes were already transformed in
    # function transform_footnotes
    editorial_notes = html_soup.find_all("note")
    if len(editorial_notes) > 0:
        for editorial_note in editorial_notes:
            # editors' notes have no attributes
            # do not show editors' notes in the manuscript/transcription column
            if editorial_note.attrs == {}:
                editorial_note.decompose()
    # transform <opener>
    elements = html_soup.find_all("opener")
    if len(elements) > 0:
        for element in elements:
            element.name = "div"
            element["class"] = ["opener"]
            element["class"].append("tei")
            element["class"].append("teiManuscript")
    # transform <closer>
    elements = html_soup.find_all("closer")
    if len(elements) > 0:
        for element in elements:
            element.name = "div"
            element["class"] = ["closer"]
            element["class"].append("tei")
            element["class"].append("teiManuscript")
    # transform <postscript>
    elements = html_soup.find_all("postscript")
    if len(elements) > 0:
        for element in elements:
            element.name = "div"
            element["class"] = ["postscript"]
            element["class"].append("tei")
            element["class"].append("teiManuscript")
    # transform <table> by wrapping it in a specific <div>
    # do this after the general div transformation in order to
    # avoid this div being transformed twice, since it's not an
    # xml <div> but an html <div> added only for the purpose of
    # being able to style tables with a vertical scrollbar
    elements = html_soup.find_all("table")
    if len(elements) > 0:
        for element in elements:
            new_div = html_soup.new_tag("div")
            new_div["class"] = ["table-wrapper"]
            new_div["class"].append("tei")
            new_div["class"].append("teiManuscript")
            element.wrap(new_div)
    # if there's no <div> at all in the file, this file's content
    # is not according to the rules for this project and should be ignored
    # (find() returns None in that case; a <div> without any
    # child nodes is treated the same way)
    element = html_soup.find("div")
    if element is None or len(element) == 0:
        html_string = ""
    # files with no text content, consisting of just an empty <div>,
    # should return an empty string
    # this will produce a message on the site, explaining that
    # there's no text to show
    elif element.get("class") == "empty":
        html_string = ""
    else:
        html_soup = prevent_empty_paragraphs(html_soup)
        html_string = str(html_soup)
        # make <a/> into <a></a> since it's not one of the
        # self-closing tags in html
        # the lxml parser and BS seem to make all empty elements
        # self-closing, with the trailing slash
        search_string = re.compile(r"(<a class.*?name.*?)/>")
        html_string = search_string.sub(r"\1></a>", html_string)
        # remove tabs
        search_string = re.compile(r"\t")
        html_string = search_string.sub("", html_string)
        # remove lines consisting only of <br/> (and possibly whitespace)
        search_string = re.compile(r"^ *(<br/>) *$", re.MULTILINE)
        html_string = search_string.sub("", html_string)
        # replace double/triple/etc. spaces
        search_string = re.compile(r"\s{2,}")
        html_string = search_string.sub(" ", html_string)
        # remove space before punctuation marks
        # situations like "word ," may happen when removing
        # deletions from the text, and we need to tidy this up
        search_string = re.compile(r"\s+(,|;|\.|:|\?|!)")
        html_string = search_string.sub(r"\1", html_string)
        # content of element p shouldn't start/end with space
        search_string = re.compile(r"(<p.*?>) ?")
        html_string = search_string.sub(r"\1", html_string)
        search_string = re.compile(r" (</p>)")
        html_string = search_string.sub(r"\1", html_string)
    print("We have new soup.")
    return html_string
def transform_footnotes(notes, html_soup):
    """Transform footnote <note> elements into tooltips and a note list.

    notes: the <note> elements to go through; only notes that have
    both @id and @n (after a missing @id has been added) are handled.
    html_soup: the soup the notes belong to; used for creating the
    new tags. The tree is modified in place, nothing is returned.
    """
    # transform footnotes
    # a footnote will be transformed twice;
    # once for the tooltip and once for a list
    # of footnotes at the end of each text div
    # <note> tags other than footnotes are transformed
    # directly in transform_tags
    # i counts the footnotes handled so far in this call
    i = 0
    for note in notes:
        # sometimes editors forget to put @id in the <note>
        # if it still has @n, it's got to be a footnote
        # let's add the id so the transformation works
        if "n" in note.attrs and "id" not in note.attrs:
            id_value = i + 1
            id_value = "ftn" + str(id_value)
            note["id"] = id_value
        if "id" in note.attrs and "n" in note.attrs:
            # we need to keep a copy of the original <note>
            # for the second transformation
            original_note = copy.copy(note)
            # this is the tooltip transformation, we need three new tags
            note_id = note.get("id")
            # on the website it's possible to have a Swedish and a
            # Finnish text next to each other, but if their footnotes
            # have identical id:s the tooltips won't work as intended,
            # showing content in the wrong language if the user chooses
            # a tooltip first in one text and then in the other
            # therefore we need to change the id:s for notes in one of
            # the languages
            if LANGUAGE == "fi":
                # the first number found in the id is used
                # an id without any digits is left as it is
                note_nr = re.findall(r"\d+", note_id)
                if len(note_nr) > 0:
                    note_nr = int(note_nr[0])
                    note_nr += 500
                    note_id = "ftn" + str(note_nr)
            note_symbol = note.get("n")
            html_note = html_soup.new_tag("span")
            html_note["class"] = ["footnoteindicator"]
            html_note["class"].append("tooltiptrigger")
            html_note["class"].append("ttFoot")
            html_note["tabindex"] = ["0"]
            html_note["data-id"] = note_id
            html_note.insert(0, note_symbol)
            note_outer_span = html_soup.new_tag("span")
            note_outer_span["class"] = ["tooltip"]
            note_outer_span["class"].append("ttFoot")
            note_inner_span = html_soup.new_tag("span")
            note_inner_span["class"] = "ttFixed"
            note_inner_span["data-id"] = note_id
            # we can't just use .get_text() for getting the note contents,
            # because we need to preserve all the tags in the note text,
            # such as <persName> or <xref>
            # by replacing <note> with the new note_inner_span tag,
            # we get the new span on its right place in the tree
            # and can use it to get the other new tags in place
            # but note_inner_span has no content until we add the old
            # content back (old note tag + its content saved in note_content)
            # since we used the old tag just to conveniently keep all note
            # contents together, we finally have to unwrap note_content,
            # getting rid of that old tag
            note_content = note.replace_with(note_inner_span)
            note_inner_span.insert_before(html_note)
            note_inner_span.insert(0, note_content)
            note_inner_span.wrap(note_outer_span)
            note_content.unwrap()
            # this is the footnote list transformation:
            # <section><p></p><ol><li><p><a></a></p></li></ol></section>
            # if this is the first note in this <div>:
            # create the section and the list
            if i == 0:
                note_section = html_soup.new_tag("section")
                note_section["role"] = "doc-endnotes"
                # the section is appended to the closest ancestor that
                # is still named "div"
                # NOTE(review): if no ancestor is named "div" the section
                # is never attached to the tree; confirm this can't happen
                for tag in html_note.parents:
                    if tag.name == "div":
                        tag.append("\n")
                        tag.append(note_section)
                        break
                # choose a heading for the list of notes
                # depending on language
                note_heading = html_soup.new_tag("p")
                if LANGUAGE == "sv":
                    note_heading.string = "Noter"
                elif LANGUAGE == "fi":
                    note_heading.string = "Viitteet"
                elif LANGUAGE == "fr" or LANGUAGE == "en":
                    note_heading.string = "Notes"
                elif LANGUAGE == "de":
                    note_heading.string = "Noten"
                else:
                    note_heading.string = "– – – – – – –"
                note_heading["class"] = "noIndent"
                note_section.append(note_heading)
                note_section.append("\n")
                note_list = html_soup.new_tag("ol")
                note_list["class"] = "footnotesList"
                note_section.append(note_list)
                note_list.append("\n")
            # the list item holds the copy of the original note,
            # turned into a <p> that starts with a link back
            # to the note's id
            listed_note = html_soup.new_tag("li")
            listed_note["data-id"] = note_id
            listed_note["class"] = "footnoteItem"
            note_list.append(listed_note)
            original_note.name = "p"
            original_note.attrs = {}
            original_note["class"] = "noIndent"
            note_reference = html_soup.new_tag("a")
            note_reference["class"] = ["xreference"]
            note_reference["class"].append("footnoteReference")
            note_reference["href"] = "#" + note_id
            note_reference["role"] = "doc-backlink"
            note_reference.append(note_symbol)
            original_note.insert(0, note_reference)
            listed_note.append(original_note)
            note_list.append("\n")
            i += 1
# delete paragraphs and headings that have no content
def prevent_empty_paragraphs(html_soup):
    """Remove verse lines, paragraphs and headings without content.

    html_soup: the transformed soup; it is modified in place.
    Returns the same soup object.
    """
    # a verse line whose content has been deleted leaves an empty
    # span with class "l": remove it together with the <br/>
    # that directly follows it
    for verse_line in html_soup.find_all(attrs={"class": "l"}):
        if len(verse_line.contents) != 0:
            continue
        follower = verse_line.next_sibling
        if follower and follower.name == "br":
            follower.decompose()
        verse_line.decompose()
    # paragraphs first, then the headings that <head> was transformed
    # to; they may be empty e.g. because they contained only <del>
    for tag_names in ("p", ["h3", "h4", "h5", "h6"]):
        for candidate in html_soup.find_all(tag_names):
            visible_text = candidate.get_text(strip = True)
            if len(visible_text) == 0:
                candidate.decompose()
    return html_soup
# create and save the new html file in another folder
def write_string_to_file(html_string, filename):
    """Save html_string as an html file in OUTPUT_FOLDER.

    html_string: the file content to write (utf8).
    filename: name of the xml source file; the output file gets the
    same name with the ".xml" suffix changed to ".html".
    Returns the name of the html file (without the folder).
    """
    # exist_ok avoids the separate existence check (and an error
    # if the folder appears between the check and the creation)
    os.makedirs(OUTPUT_FOLDER, exist_ok=True)
    # only the suffix should change, not an ".xml" that happens
    # to occur earlier in the file name
    if filename.endswith(".xml"):
        html_filename = filename[:-len(".xml")] + ".html"
    else:
        html_filename = filename.replace(".xml", ".html")
    output_path = os.path.join(OUTPUT_FOLDER, html_filename)
    # the with statement closes the file even if writing fails
    with open(output_path, "w", encoding="utf8") as output_file:
        output_file.write(html_string)
    return html_filename
def main():
    """Transform every xml file in the source folder to an html file.

    Each file is read, moved into an html template, transformed and
    written to the output folder; the name of each created file is
    printed.
    """
    for source_name in get_source_file_paths():
        html_soup = create_html_file(read_xml(source_name))
        html_string = transform_tags(html_soup)
        created_name = write_string_to_file(html_string, source_name)
        print(created_name + " created.")
# run the transformation of all source files
# NOTE(review): there is no `if __name__ == "__main__":` guard here,
# so importing this file as a module also runs main()
main()