-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsplitting_of_pg22.py
131 lines (97 loc) · 3.53 KB
/
splitting_of_pg22.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
"""
Split the raw pg22.txt into formatted individual files.
The original text is at: http://www.gutenberg.org/cache/epub/22/pg22.txt
Regular expressions reference pages: https://docs.python.org/3/library/re.html?highlight=regular%20expressions
and HOWTO: https://docs.python.org/3/howto/regex.html#regex-howto
"""
"""
The individual articles have the following format:
#d...dl - a '#', then one to four digits, then an optional letter,
followed by multiple lines of text and a terminating blank line.
The hierarchy has 6 classes, numbered with Roman numerals and centered on a page with a blank line before and after:
CLASS I
WORDS EXPRESSING ABSTRACT RELATIONS
It has sections in every class numbered by Roman numerals with % separators before and after;
%
SECTION II. RELATION
...
%
Individual parts of a section are numbered with arabic numerals as follows:
%
2. BEING, IN THE CONCRETE yeah!
%
The first part of a section is enclosed in the same pair of % separators as the section header.
There are also unnumbered 'parts of parts'; they are enclosed in % separators too, but have no number and all belong to the same part.
The pages are marked as:
<-- p. 9 -->
These marks should probably be removed first.
"""
import re
# Line-level patterns for the Roget's Thesaurus text (pg22.txt).
# All patterns are raw strings so that regex escapes such as \d and \[
# are not interpreted (and warned about) as string escapes.

# A '%' immediately followed by a newline; with match() this is a line
# holding only the '%' delimiter.
be = re.compile(r'%\n')  # beginning or end of a multi-line
# Keyword in the title of a class, section, part.
hd = re.compile(r'(CLASS|SECTION|THESAURUS)')
# The pattern used to be a single space, contradicting its own
# description; it now really requires five consecutive spaces.
tb = re.compile(r' {5}')  # starting tab of 5 spaces
me = re.compile(r'\[.+\]')  # everything in brackets
# Items with a '#', digits, an optional lower-case letter and '.' in
# front.  The optional letter follows the item format described in the
# notes at the top of the file.
it = re.compile(r'#\d+[a-z]?\.')

# Numeric character references, written once in verbose form and once
# in compact form; both accept the same text.
charref_1 = re.compile(r"""
 &[#]                # Start of a numeric entity reference
 (
     0[0-7]+         # Octal form
   | [0-9]+          # Decimal form
   | x[0-9a-fA-F]+   # Hexadecimal form
 )
 ;                   # Trailing semicolon
""", re.VERBOSE)
charref_2 = re.compile("&#(0[0-7]+"
                       "|[0-9]+"
                       "|x[0-9a-fA-F]+);")

cl = re.compile(r'\bCLASS\b')  # class title
se = re.compile(r'\bSECTION\b')  # section title
# TODO: placeholder - the empty pattern matches at every position, so it
# does not single out part titles yet.
pt = re.compile('')  # part
# Scan state.
line: str = ''  # the line currently being examined
# `lines` and `beg` are not read by the loop below; they are kept because
# the earlier draft of the main loop (disabled in a string literal
# further down) appends to `lines` and tests `beg` together with the '%'
# delimiter.
lines: list = []
beg = True

# Main scan: print the first line of every item found in pg22.txt.
# NOTE(review): no encoding is passed to open(), so the platform default
# is used - confirm that it matches the downloaded file.
with open('pg22.txt', 'tr') as roget:
    for line in roget:  # iteration ends at end of file
        if be.match(line):  # the % delimiter
            # TODO: do the (compound) header reading.  The unfinished
            # `if re.search()` that stood here had neither arguments nor
            # a colon and made the whole script a SyntaxError.
            continue
        if it.search(line):
            print(line)  # the first line of an item
        # Anything else is neither header nor item and is ignored.
"""
# the main cycle:
with open('pg22.txt', 'r') as roget:
while True:
line = roget.readline()
if not line: break # end of the main loop.
if re.match(be, line) and beg: # title of a class, section...
while True:
line = roget.readline()
if not line: break
if not re.match(be, line) and beg:
lines.append(line)
else:
print(lines)
lines = []
break # processing of a title ends here
beg = False
elif not beg: beg = True
if re.search(it, line):
print(line)
"""
# A symbol at the very beginning of a line is detected best by
# re.match(); that is what should be used for '%' and '<--'.
#
# The string literal below is a disabled sketch that iterates over the
# lines of the text fetched from the Gutenberg URL rather than reading a
# local file.
# NOTE(review): a bare `import urllib` may not expose `urllib.request`;
# confirm (e.g. `import urllib.request`) before enabling it.
"""
import urllib
for line in urllib.request.urlopen('http://www.gutenberg.org/cache/epub/22/pg22.txt'):
pass
"""
print('ok')  # presumably a marker that the script ran through