-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
50 lines (42 loc) · 2.44 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# -*- coding: utf-8 -*-
from structure_extraction.io import io
from structure_extraction.transform import simplify, sort, breakdown, paginate
from structure_extraction.utils import utils
def main():
    """Run the XML structure-extraction pipeline from the command line.

    Parses ``-i/--input`` (required) and ``-o/--output`` (optional), loads the
    input with ``io.make_the_soup`` and, when the loader returns a truthy
    value, chains the processing stages in this order:

    1. ``simplify.rearrange``
    2. ``sort.exclude_headers_signatures`` (also yields the "guard" tree and
       three warning collections)
    3. ``paginate.paginate``
    4. ``breakdown.make_breakers``

    The warnings are then passed to ``utils.report`` and both the main and the
    guard results are serialised with ``io.make_string`` and written with
    ``io.write_output``. When the loader returns a falsy value nothing else
    is done.
    """
    import argparse

    cli = argparse.ArgumentParser(description="Transform XML files to raw text.")
    cli.add_argument(
        "-i", "--input",
        action="store",
        required=True,
        nargs=1,
        help="path to file to transform.",
    )
    cli.add_argument(
        "-o", "--output",
        action="store",
        nargs=1,
        help="desired path to resulting filename. Default : input filename + '_out.xml | _guard.xml.'",
    )
    options = cli.parse_args()

    # With nargs=1 argparse stores a one-element list. The input path is
    # unwrapped; the output option is forwarded untouched (a list, or None
    # when -o is omitted).
    # NOTE(review): confirm io.make_out_filenames expects that list/None form.
    source_path = options.input[0]
    requested_output = options.output

    # Step 1: read the "ABBY" XML file (presumably ABBYY OCR output).
    tree = io.make_the_soup(source_path)
    # TODO: add schema test before continuing - ABBY schema
    if not tree:
        return

    # Step 2: simplify the XML tree by sorting text and non-text blocks.
    tree = simplify.rearrange(tree)
    # TODO: add schema test before continuing -- homemade schema

    # Step 3: sort out headers and signatures; this stage may produce warnings.
    (
        guard_tree,
        tree,
        header_warnings,
        signature_warnings,
        corrected_header_warnings,
    ) = sort.exclude_headers_signatures(tree)
    tree = paginate.paginate(tree)
    # TODO: add schema test before continuing -- homemade schema

    # Step 4: separate the tree structure from the physical structure of the text.
    tree = breakdown.make_breakers(tree)
    # TODO: add schema test before continuing -- homemade schema

    # Step 5: raise the collected warnings, one category at a time.
    for collected, label in (
        (header_warnings, "HEADER"),
        (signature_warnings, "SIGNATURE"),
        (corrected_header_warnings, "CORRECT_HEADER"),
    ):
        utils.report(collected, label)

    # Step 6: build the content of the output files.
    xml_text = io.make_string(tree)
    guard_text = io.make_string(guard_tree)

    # Step 7: derive the output file names and write the results.
    # The third returned name (plain-text output) is not used yet.
    xml_path, guard_path, _txt_path = io.make_out_filenames(source_path, requested_output)
    io.write_output(xml_path, xml_text)
    io.write_output(guard_path, guard_text)

    # TODO: make plain text output
    #   - recompose paragraphs
    #   - identify title
    #   - add management of location within the article from titles and headers


if __name__ == "__main__":
    main()