-
Notifications
You must be signed in to change notification settings - Fork 0
/
parse.py
366 lines (317 loc) · 11.8 KB
/
parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
# parse.py - parse the Linux From Scratch book
import os
import re
import sys
import json
import datetime
from dateutil.parser import parse
from lxml import etree, html
from common import get_html_root
import common
from book import LFSBook, variables
from snippet import Snippet
from section import Section
from pkg import Pkg, wget_files, get_non_std_subdirs
from checksum import Checksum
#-------------------------------------------------------------------------------
# Title - version and date
#-------------------------------------------------------------------------------
def get_version(root):
    """Extract the book version and publication date from the title page.

    Looks at every <h2 class="subtitle"> under the book's title page and
    recognizes two forms: "Version <ver>" and "Published <date>".

    Args:
        root: root element of the LFS book HTML tree.

    Returns:
        A tuple (ver, dt): `ver` is the version string and `dt` a
        `datetime.date`. Either one is None when the corresponding subtitle
        is not found (including when the title page itself is missing).
    """
    ver = dt = None
    nd = root.find('.//div[@class="book"]/div[@class="titlepage"]/div')
    if nd is None:
        # No title page: nothing to extract
        return ver, dt
    for sub in nd.findall('.//div/h2[@class="subtitle"]'):
        # An empty <h2> has text None; treat it as an empty string
        s = (sub.text or '').strip()
        m = re.match(r'Version (.*)$', s)
        if m:
            ver = m.group(1)
            continue
        m = re.match(r'Published (.*)$', s)
        if m:
            dt = parse(m.group(1)).date()
    return ver, dt
#-------------------------------------------------------------------------------
# Chapter 3 - packages and patches
#-------------------------------------------------------------------------------
def parse_pkg(dt, dd, has_ver=True):
    """Build a Pkg from one <dt>/<dd> pair of the package (or patch) list.

    Args:
        dt: the <dt> element; its first child is the term span whose text
            holds the name (and the version in parentheses for packages),
            and whose own first child is the token span holding the size.
        dd: the <dd> element; its <p> children carry the "Download:" link
            and the "MD5 sum:" value.
        has_ver: True for packages (name followed by "(version)"), False for
            patches (name followed by " -").

    Returns:
        Pkg(name, size, url, md5, version). A field that could not be
        extracted is None; a message is printed when the term or the size
        token cannot be parsed.
    """
    # Every field defaults to None so that a parse failure, already reported
    # below, does not turn into an UnboundLocalError at the return statement.
    name = version = size = url = md5 = None

    # Term: element <span class="term">
    term = dt[0]
    s = (term.text or '').replace('\n', '')
    # packages have a version number, patches do not
    pat = r'([^(]+)\(([^)]+)\)' if has_ver else r'(.*) -'
    m = re.match(pat, s)
    if m:
        name = m.group(1).strip()
        if has_ver:
            version = m.group(2)
    else:
        print(f'Term: unable to parse "{s}"')

    # Element <span class="token">
    token = term[0]
    s = token.text or ''
    m = re.match(r'([0-9,]+)', s)
    if m:
        # The size is written with thousands separators, e.g. "1,234"
        size = int(m.group(1).replace(',', ''))
    else:
        print(f'Token: unable to parse "{s}"')

    # Definition
    for k in dd:
        if k.tag != 'p':
            continue
        s = (k.text or '').strip()
        if s.startswith('Download:'):
            # First child is the link to the archive
            url = k[0].attrib['href']
        elif s.startswith('MD5 sum:'):
            # First child holds the checksum text
            md5 = Checksum('md5', k[0].text)
    return Pkg(name, size, url, md5, version)
def parse_pkg_list(root, id, has_ver=True):
    """Return the list of Pkg objects found in the section anchored by `id`.

    Args:
        root: root element of the LFS book HTML tree.
        id: value of the `id` attribute of the section's <a> anchor; the
            "ch-" prefixed form is tried when the plain one is not found.
        has_ver: forwarded to parse_pkg (packages have a version, patches
            do not).

    Returns:
        One parse_pkg() result per <dt>/<dd> pair, in document order.
    """
    anchor = root.find(f'.//a[@id="{id}"]')
    if anchor is None:
        # As of version 9.1
        anchor = root.find(f'.//a[@id="ch-{id}"]')
    # Climb five levels up from the anchor to reach the enclosing
    # <div class="sect1" lang="en" xml:lang="en">
    sect = anchor
    for _ in range(5):
        sect = sect.getparent()
    terms = sect.findall('.//div/div/dl/dt')
    definitions = sect.findall('.//div/div/dl/dd')
    return [parse_pkg(term, definition, has_ver=has_ver)
            for term, definition in zip(terms, definitions)]
#-------------------------------------------------------------------------------
# sect1 - run commands
#-------------------------------------------------------------------------------
def get_em_code(nd, request_mark=False):
    """Concatenate the text carried by the <em> and <code> children of `nd`.

    The element's own leading text is not included, only what its children
    contribute (their text, their nested <em>/<code> content and their tail).

    Args:
        nd: element whose children are walked.
        request_mark: when True, the text of each direct <code> child is
            wrapped in a literal "<code>...</code>" marker. This is set by
            the recursion for the children of an <em class="replaceable">.

    Returns:
        The assembled string. Children with any other tag are reported on
        stdout and contribute nothing (neither text nor tail).
    """
    parts = []
    for child in nd:
        tag = child.tag
        if tag == 'em':
            # A "replaceable" <em> asks for its <code> children to be marked
            replaceable = child.attrib.get('class') == 'replaceable'
            if child.text:
                parts.append(child.text)
            parts.append(get_em_code(child, replaceable))
        elif tag == 'code':
            if child.text:
                if request_mark:
                    parts.append(f'<code>{child.text}</code>')
                else:
                    parts.append(child.text)
            # The mark request is not propagated below a <code> element
            parts.append(get_em_code(child))
        else:
            print(f'get_em_code: unhandled element with tag <{tag}>')
            continue
        # Text following a handled child belongs to the output as well
        if child.tail:
            parts.append(child.tail)
    return ''.join(parts)
def get_kbds_script(kbds):
    """Return the script text spread over a sequence of <kbd> elements.

    Each <kbd> contributes its own leading text followed by the content of
    its <code> / <em><code> children, as assembled by get_em_code().
    """
    return ''.join((kbd.text or '') + get_em_code(kbd) for kbd in kbds)
# HTML elements with class "sect1" rather than "wrap" correspond to series of
# instructions to be executed, that do not build a software package.
def parse_sect1(nd):
    """Parse one section with class="sect1" (instructions, no package build).

    Args:
        nd: the <div class="sect1"> element.

    Returns:
        A Section (created with with_pkg=False) holding the snippets found
        in the section's <pre> elements, or None when the section has no
        title anchor, no title text, or no non-empty snippet.
    """
    a = nd.find('.//div/div/div/h2/a')
    if a is None or a.tail is None:
        return None
    # The title is the text following the anchor; collapse runs of
    # whitespace (the HTML source wraps long titles over several lines).
    title = re.sub(r'\s+', ' ', a.tail.strip())

    # Extract build instructions
    snippets = []
    for pre in nd.findall('.//pre'):
        s = pre.text.strip() if pre.text else ''
        kl = pre.attrib.get('class')
        # Note: <pre class="root" has only been found in sect1 sections
        if kl in ('userinput', 'root'):
            # This is a code snippet
            s += get_kbds_script(pre.findall('.//kbd[@class="command"]'))
            if s:
                snippets.append(Snippet('userinput', s))
        elif kl == 'screen':
            # This is some program output
            s += get_em_code(pre)
            if s:
                snippets.append(Snippet('screen', s))

    # Only create section object if there are instructions
    if snippets:
        return Section(title, snippets, with_pkg=False)
    return None
#-------------------------------------------------------------------------------
# Chapters 5 and 6 - building system software
#-------------------------------------------------------------------------------
# This parses the paragraph following the section title which gives the
# estimated build time in SBUs and the approximate disk size.
def parse_package(nd):
    """Read the estimated build time and disk size of a package section.

    Args:
        nd: the <div class="package"> element of a section.

    Returns:
        A tuple (sbu, sz) of floats: the build time in SBUs and the disk
        size as the number printed in the book. Each one is 0.0 when it is
        missing or cannot be parsed. The input tree is left unmodified.
    """
    sbu = 0.0
    sz = 0.0
    item = nd.find('.//div[@class="segmentedlist"]/div[@class="seglistitem"]')
    if item is None:
        # No segmented list in this section: keep the defaults
        return sbu, sz
    for seg in item.findall('.//div[@class="seg"]'):
        # One seg for build time, another one for disk size. The <strong>
        # label comes first and tells which one the following <span> is.
        build_time = False
        for kk in seg:
            if kk.tag == 'strong':
                if kk.text == 'Approximate build time:':
                    build_time = True
            elif kk.tag == 'span':
                text = kk.text or ''
                if build_time:
                    # Normalize locally instead of rewriting the element
                    if text == 'less than 0.1 SBU':
                        text = '0.1 SBU'
                    m = re.match(r'(\d+(\.\d+)?) SBU', text)
                    if m:
                        sbu = float(m.group(1))
                else:
                    # Disk space
                    # NOTE(review): the unit (GB or MB) is matched but not
                    # applied, so sizes printed in different units are
                    # returned unscaled - confirm what callers expect.
                    m = re.match(r'(\d+(\.\d+)?) (G|M)B', text)
                    if m:
                        sz = float(m.group(1))
    return sbu, sz
def parse_wrap(nd):
    """Parse one section with class="wrap" (a section that builds a package).

    Args:
        nd: the <div class="wrap"> element.

    Returns:
        A Section holding the snippets found in the section's installation,
        configuration and sect2 parts, with `with_pkg`, `sbu` and `sz` taken
        from its "package" part when there is one (otherwise False, None and
        None). Returns None when the section has no title anchor, no title
        text, or no non-empty snippet.
    """
    a = nd.find('.//div/div/div/h2/a')
    if a is None or a.tail is None:
        return None
    # The title is the text following the anchor; collapse runs of
    # whitespace (the HTML source wraps long titles over several lines).
    title = re.sub(r'\s+', ' ', a.tail.strip())
    # Patch
    title = title.replace('xml::parser', 'xml-parser')

    # Extract build instructions
    snippets = []
    with_pkg = False
    sbu = sz = None
    for div in nd.findall('./div'):
        # A section may have different (sub-)classes
        kl = div.attrib.get('class')
        if kl == 'package':
            sbu, sz = parse_package(div)
            with_pkg = True
            continue
        # sect2 is used for the 8.4 grub section. A div without any class
        # attribute is not filtered out here.
        if kl is not None and kl not in ('installation', 'configuration',
                                         'sect2'):
            continue
        for pre in div.findall('.//pre'):
            s = pre.text.strip() if pre.text else ''
            pre_kl = pre.attrib.get('class')
            # Note: <pre class="root" has only been found in sect1 sections
            if pre_kl in ('userinput', 'root'):
                # This is a code snippet
                s += get_kbds_script(pre.findall('.//kbd[@class="command"]'))
                if s:
                    snippets.append(Snippet('userinput', s))
            elif pre_kl == 'screen':
                # This is some program output
                s += get_em_code(pre)
                if s:
                    snippets.append(Snippet('screen', s))

    # Only create section object if there are instructions
    if snippets:
        return Section(title, snippets, with_pkg=with_pkg, sbu=sbu, sz=sz)
    return None
def parse_all_chapters(root):
    """Collect the sections of every chapter of the book.

    Every <div> found anywhere below a <div class="chapter"> is examined:
    class "wrap" is handed to parse_wrap() and class "sect1" to
    parse_sect1(); any other div is ignored.

    Args:
        root: root element of the LFS book HTML tree.

    Returns:
        The list of sections returned by the two parsers, in document
        order, leaving out the divs for which they returned None.
    """
    sections = []
    for chap_div in root.findall('.//div[@class="chapter"]'):
        # <div class="chapter" lang="en" xml:lang="en">
        for div in chap_div.findall('.//div'):
            kl = div.attrib.get('class', '')
            if kl == 'wrap':
                sect = parse_wrap(div)
            elif kl == 'sect1':
                sect = parse_sect1(div)
            else:
                continue
            if sect is not None:
                sections.append(sect)
    return sections
#-------------------------------------------------------------------------------
# parse_book - build an LFSBook instance from the book's HTML tree
#-------------------------------------------------------------------------------
def parse_book(root):
    """Return an LFSBook instance from an LFS book HTML tree.

    The steps run in a fixed order: read the version and publication date,
    make sure the 'pkgs' directory exists under common.lfs_data, create the
    book object, fill in its package list, update the shared `variables`
    mapping, parse all chapters into sections, then load the sub-directory
    names from the 'subdir_names' file before setting archive filenames.

    Args:
        root: root element of the LFS book HTML tree.

    Returns:
        The populated LFSBook instance.
    """
    version, pub_date = get_version(root)
    # Get packages, checksums, subdir names, ...
    pkgs_path = os.path.join(common.lfs_data, 'pkgs')
    if not os.path.isdir(pkgs_path):
        os.mkdir(pkgs_path)
    # NOTE(review): presumably fetches the files for this book version into
    # pkgs_path - confirm against pkg.wget_files
    wget_files(version, pkgs_path)
    # Initialize book object
    bk = LFSBook(common.lfs_data, version, pub_date)
    # Get the packages data, including name, version and archive filename. In
    # particular, this is where we find the linux kernel version.
    bk.pkgs = parse_pkg_list(root, 'materials-packages')
    # Set the values of dynamic variables
    variables.update(dict(
        pkg_repository=pkgs_path,
        lfs_version=version,
        kernel_version=bk.get_kernel_version(),
    ))
    bk.apply_mapping()
    # FIXME parsing functions should be in LFSBook
    bk.sections = parse_all_chapters(root)
    # Calculate expected number of SBUs
    bk.cumulate_sbus()
    # Used by set_archive_filenames
    filepath = os.path.join(pkgs_path, 'subdir_names')
    if not os.path.isfile(filepath):
        # NOTE(review): expected to create the 'subdir_names' file opened
        # just below - confirm against pkg.get_non_std_subdirs
        get_non_std_subdirs(pkgs_path)
    # The file content is JSON
    with open(filepath, 'r') as f:
        bk.subdir_names = json.load(f)
    # Determine each section's archive filename and set it here. This is needed
    # because the package names in chapter 3, the archive filenames in chapter
    # 3, and the package (section) names in chapter 5 don't always match.
    bk.set_archive_filenames()
    return bk
#===============================================================================
# main
#===============================================================================
if __name__ == '__main__':
    # Script entry point: parse the book of the requested version and
    # generate code from it.

    # Check cmd line args
    if len(sys.argv) != 2:
        print(f'Usage: {sys.argv[0]} <version>')
        # sys.exit rather than the builtin exit(): the latter is installed by
        # the site module for interactive use and is not always available.
        sys.exit(-1)
    # Version of the LFS book to be used
    version = sys.argv[1]
    # Get the LFS book to use as an HTML tree, and parse it
    bk = parse_book(get_html_root(version))
    print(f'Found {len(bk.pkgs)} pkgs, {len(bk.sections)} sections')
    bk.gen_code()