-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathgenerate_html_guide.py
executable file
·352 lines (317 loc) · 14.6 KB
/
generate_html_guide.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
#!/usr/bin/env python
# coding=utf-8
#
## OxygenGuide - Offline travel guide
## http://code.google.com/p/oxygenguide
##
## Generate HTML files for articles from http://en.wikivoyage.org
## Input: Wikivoyage dump from http://dumps.wikimedia.org/enwikivoyage/
## Output: HTML for local browsing.
##
## Author: Nicolas Raoul http://nrw.free.fr
# Import useful libraries.
import os
import re
import sys
from xml.sax import make_parser
from xml.sax.handler import ContentHandler
from urllib import urlencode
## Settings
# Path to the input file: a Wikivoyage XML dump such as
# 'enwikivoyage-20130101-pages-articles.xml', given as the first argument.
if len(sys.argv) < 2:
    # Exit with a usage hint instead of an IndexError traceback.
    sys.exit('Usage: ' + sys.argv[0] + ' <wikivoyage-pages-articles.xml>')
databaseDump = sys.argv[1]
print('Using data from ' + databaseDump)
# Directory that receives the generated article pages.
outputDirectory = 'articles'
# When True, whitespace runs in article bodies are collapsed to one space
# and no newline is written between converted lines.
minimization = True
def urlencode_string(target):
    """Return `target` encoded for use as a URL query-string value.

    The string is encoded as the value of a parameter with an empty name;
    the leading '=' separating name and value is then dropped.
    """
    encodedPair = urlencode({'': target})
    return encodedPair[1:]
# Case-insensitive pattern for the '#REDIRECT' marker.
re_redirect = re.compile('#REDIRECT', re.I)


def is_redirect(wikicode):
    """Tell whether `wikicode` starts with a '#REDIRECT' marker.

    Returns the match object (truthy) when the marker sits at the very
    beginning of the text, and None otherwise.
    """
    redirectMatch = re_redirect.match(wikicode)
    return redirectMatch
# Some operating systems don't like 20000 files in the same directory, or
# filenames with exotic characters, so every article gets a numeric path.
def hashName(articleName):
    """Return the relative path of an article, e.g. '38/16720965.html'.

    The decimal digits of abs(hash(articleName)) are split into a two-digit
    directory name, which spreads the files over many directories, and a
    file name.  The directory is created below `outputDirectory` when it is
    missing, together with `outputDirectory` itself, so the function may be
    called before the output tree exists.

    Two article names hashing to the same value are not detected.
    """
    hashvalue = '%d' % abs(hash(articleName))
    directory = hashvalue[:2]
    filename = hashvalue[2:]
    directoryPath = os.path.join(outputDirectory, directory)
    if not os.path.isdir(directoryPath):
        # makedirs, unlike mkdir, also creates missing parent directories.
        os.makedirs(directoryPath)
    return directory + '/' + filename + '.html'
# This class represents a Wikivoyage article, parses it and processes its content.
class Article(object):
    """Hold the wikicode and title of one article and export it as HTML.

    saveHTML() depends on names defined at module level elsewhere in this
    script: `outputDirectory`, `minimization`, `hashName`, `isPartOfs`,
    `redirects` and `articleNames`.
    """

    # All patterns are compiled once here instead of once per processed line.

    # Image and interwiki links: a line starting with '[[Something:'.
    _RE_NAMESPACED_LINK = re.compile(r'^\[\[[^\]]*:')
    # Region template parameters.
    _RE_REGION_COLOR = re.compile(r'^\s*region[0-9]*color.*\|\s*$')
    _RE_REGION_ITEMS = re.compile(r'^\s*region[0-9]*items.*\|\s*$')
    _RE_REGION_NAME = re.compile(r'^\s*region[0-9]*name=\[\[[^\]]*\]\]\s*\|\s*$')
    _RE_REGION_NAME_PREFIX = re.compile(r'^\s*region[0-9]*name=')
    _RE_TRAILING_PIPE = re.compile(r'\s*\|\s*$')
    _RE_REGION_DESCRIPTION = re.compile(r'^\s*region[0-9]*description.*\|\s*$')
    _RE_REGION_DESCRIPTION_PREFIX = re.compile(r'^\s*region[0-9]*description=')
    _RE_SPACE_PIPE = re.compile(r' \|')
    # Generic templates.
    _RE_TEMPLATE_START = re.compile(r'^\{\{')
    _RE_TEMPLATE_PARAMETER = re.compile(r'^\|')
    _RE_TEMPLATE_PARAMETER_NAME = re.compile(r'^\|[^=]*=')
    _RE_TEMPLATE_END = re.compile(r'^\}\}')
    # Comments.
    _RE_COMMENT = re.compile(r'<![^<>]*>')
    _RE_COMMENT_START = re.compile(r'^<!--')
    _RE_BLANK = re.compile(r'^\s*$')
    # Sub-headers as (whole line, opening marks, closing marks, HTML tag),
    # ordered from the longest marker to the shortest.
    _RE_SUBHEADERS = (
        (re.compile(r'^\s*=====.*=====\s*$'), re.compile(r'^(\s*=====\s*)'),
         re.compile(r'(\s*=====\s*)$'), 'h5'),
        (re.compile(r'^\s*====.*====\s*$'), re.compile(r'^(\s*====\s*)'),
         re.compile(r'(\s*====\s*)$'), 'h4'),
        (re.compile(r'^\s*===.*===\s*$'), re.compile(r'^(\s*===\s*)'),
         re.compile(r'(\s*===\s*)$'), 'h3'),
    )
    _RE_SECTION = re.compile(r'^\s*==.*==\s*$')
    _RE_SECTION_OPEN = re.compile(r'^(\s*==\s*)')
    _RE_SECTION_CLOSE = re.compile(r'(\s*==\s*)$')
    # Old-style listings such as '<see name="..." address="...">...</see>'.
    # NOTE(review): these patterns only recognise literal '<' and '>'.  If
    # dump lines carry escaped markup ('&lt;see ...&gt;') this branch never
    # fires - TODO confirm how listing tags appear in the dump.
    _RE_OLD_LISTING = re.compile(r'^<li>\s*<(see|do|buy|eat|drink|sleep).*[<>]/.*')
    _RE_OLD_LISTING_OPEN = re.compile(r'^<li>\s*<(see|do|buy|eat|drink|sleep)[^\s]* [^\s]*="')
    # The lookahead requires a non-empty value without consuming its first
    # character.
    _RE_NEXT_ATTRIBUTE = re.compile(r'" [^\s]*="(?=[^"])')
    _RE_REMAINING_ATTRIBUTE = re.compile(r'" [^\s]*="')
    _RE_OPENING_TAG_END = re.compile(r'"\s*>')
    _RE_CLOSING_TAGS = re.compile(r'</.*>')
    # New-style listing coordinates.
    _RE_HAS_COORDINATES = re.compile(r'.*lat=[-0-9][^ ]* \| long=[-0-9].*')
    _RE_COORDINATES = re.compile(r'.*lat=([^ ]*) \| long=([^ ]*).*', re.I | re.U)
    _RE_WHITESPACE = re.compile(r'\s+')

    def __init__(self, wikicode, articleName):
        """Store the raw `wikicode` and the `articleName` (title)."""
        self.wikicode = wikicode
        self.articleName = articleName

    # Parse the wikicode and write this article as an HTML file.
    def saveHTML(self):
        """Write this article to '<outputDirectory>/<hashName(title)>'.

        The page contains, in order: the HTML head with the title, the
        breadcrumb of parent articles (if any), a menu linking to every
        '==Section==', the converted body, and the closing tags.
        """
        print(self.articleName)
        outputPath = '%s/%s' % (outputDirectory, hashName(self.articleName))
        # The with-statement closes the file even if the conversion fails.
        with open(outputPath, 'w') as outputFile:
            outputFile.write('<html><head><title>%s</title><meta http-equiv="Content-Type" content="text/html; charset=UTF-8" /></head><body>' % self.articleName)
            outputFile.write(self._breadcrumbHTML())
            menuItems, body = self._renderBody()
            # Menu
            outputFile.write("<ul>")
            for position, menuItem in enumerate(menuItems):
                outputFile.write('<li><a href="#' + str(position) + '">' + menuItem + '</a></li>')
            outputFile.write("</ul>")
            outputFile.write(body)
            outputFile.write('</body></html>')

    def _breadcrumbHTML(self):
        """Return the '<p><i>...</i></p>' trail of parent articles, or ''."""
        cursor = self.articleName
        breadcrumb = []
        while cursor in isPartOfs:
            parent = isPartOfs[cursor]
            breadcrumb.append(parent)
            if len(breadcrumb) > 100:
                print("IsPartOf circular reference detected: " + '←'.join(breadcrumb))
                break
            cursor = parent
        if not breadcrumb:
            return ''
        # The most distant ancestor comes first.
        trail = ''.join(
            ' → <a href="../' + hashName(ancestor) + '"> ' + ancestor + '</a>'
            for ancestor in reversed(breadcrumb))
        return '<p><i>' + trail + '</i></p>'

    def _renderBody(self):
        """Convert the wikicode line by line.

        Returns a tuple (menuItems, body): the titles of the '==Section=='
        headers in order of appearance, and the converted HTML body.
        """
        menuItems = []  # Titles of sections such as History, See, Eat, etc
        bodyParts = []
        lastLineWasBlank = True
        lines = self.wikicode.split('\n')
        if lines[-1] == '':
            # A trailing newline (or empty wikicode) does not start a new line.
            lines.pop()
        for line in lines:
            # Image and interwiki links (ignored).
            if self._RE_NAMESPACED_LINK.match(line):
                continue
            # Region template (only display region wikilink and description).
            if self._RE_REGION_COLOR.match(line):  # Ignore region color
                continue
            if self._RE_REGION_ITEMS.match(line):  # Ignore region items
                continue
            if self._RE_REGION_NAME.match(line):
                # Leave only the wikilink, which will be processed afterwards.
                line = self._RE_REGION_NAME_PREFIX.sub('', line)
                line = self._RE_TRAILING_PIPE.sub('', line)
            if self._RE_REGION_DESCRIPTION.match(line):
                # Leave only description.
                line = self._RE_REGION_DESCRIPTION_PREFIX.sub(' ', line)
                line = self._RE_SPACE_PIPE.sub('', line)
            # Template (just print lines content).
            if self._RE_TEMPLATE_START.match(line):
                continue
            if self._RE_TEMPLATE_PARAMETER.match(line):
                line = self._RE_TEMPLATE_PARAMETER_NAME.sub('', line)
            if self._RE_TEMPLATE_END.match(line):
                continue
            # Comment (ignored).
            line = self._RE_COMMENT.sub('', line)
            if self._RE_COMMENT_START.match(line):
                continue
            # Blank line: the first of a run becomes a paragraph break.
            if self._RE_BLANK.match(line):
                if lastLineWasBlank:
                    continue
                line = '<p>'
                lastLineWasBlank = True
            else:
                lastLineWasBlank = False
            # Header.
            for (headerRe, openRe, closeRe, tag) in self._RE_SUBHEADERS:
                if headerRe.match(line):
                    line = openRe.sub('<' + tag + '>', line)
                    line = closeRe.sub('</' + tag + '>', line)
            if self._RE_SECTION.match(line):
                line = self._RE_SECTION_OPEN.sub('', line)
                line = self._RE_SECTION_CLOSE.sub('', line)
                menuItems.append(line)
                line = '<a name="' + str(len(menuItems) - 1) + '"></a><h2>' + line + '</h2>'
            # List item.
            if line.startswith('*'):
                line = '<li>' + line[1:] + '</li>'
            # Wikilinks.
            if ']]' in line:
                line = self._renderWikilinks(line)
            # External links.
            if ']' in line:
                line = self._renderExternalLinks(line)
            # Old-style listing.
            if self._RE_OLD_LISTING.match(line):
                line = self._renderOldListing(line)
            # New-style listing.
            # Coordinates
            if self._RE_HAS_COORDINATES.match(line):
                coords = self._RE_COORDINATES.search(line)
                line = line + ' <a href="geo:' + coords.group(1) + ',' + coords.group(2) + '">(map)</a>'
            # TODO: Rest of new listing. Difficult because multi-line
            # Bold, then italic: remove.
            line = line.replace("'''", "").replace("''", "")
            if minimization:
                line = self._RE_WHITESPACE.sub(' ', line)
            bodyParts.append(line)
            if not minimization:
                bodyParts.append('\n')
        return menuItems, ''.join(bodyParts)

    @staticmethod
    def _renderWikilinks(line):
        """Replace every '[[target|label]]' of `line` by an HTML link.

        Targets are resolved through `redirects`; a target that is not in
        `articleNames` is rendered as red text instead of a broken link.
        """
        rendered = []
        restOfLine = line
        while restOfLine:
            # Split one portion from the line: text, then maybe a wikilink.
            portion, _, restOfLine = restOfLine.partition(']]')
            text, _, wikilink = portion.partition('[[')
            rendered.append(text)
            # Parse the inside of the wikilink.
            target, pipe, label = wikilink.partition('|')
            if pipe:
                target = target.strip()
                label = label.strip()
            else:
                label = wikilink
            # Redirected target, or if inexistent the target itself.
            target = redirects.get(target, target)
            if label:  # Ignore if label is empty
                if target in articleNames:
                    rendered.append('<a href="../' + hashName(target) + '">' + label + '</a>')
                else:
                    # Don't create a link, because it would be a broken link.
                    rendered.append('<font color="red">' + label + '</font>')
        return ''.join(rendered)

    @staticmethod
    def _renderExternalLinks(line):
        """Replace every '[url label]' of `line` by an HTML link."""
        rendered = []
        restOfLine = line
        while restOfLine:
            # Split one portion from the line: text, then maybe an external link.
            portion, _, restOfLine = restOfLine.partition(']')
            text, _, extlink = portion.partition('[')
            rendered.append(text)
            if extlink:
                # The URL ends at the first space; the rest is the label.
                target, space, label = extlink.partition(' ')
                if space:
                    target = target.strip()
                    label = label.strip()
                rendered.append('<a href="' + target + '">[' + label + '↗]</a>')
        return ''.join(rendered)

    @classmethod
    def _renderOldListing(cls, line):
        """Flatten an old-style listing tag into plain sentences."""
        # Opening tag containing interesting attributes.
        line = cls._RE_OLD_LISTING_OPEN.sub('<li>', line)
        line = cls._RE_NEXT_ATTRIBUTE.sub('. ', line)
        line = cls._RE_REMAINING_ATTRIBUTE.sub('', line)
        line = cls._RE_OPENING_TAG_END.sub('. ', line)
        # Closing tag.
        return cls._RE_CLOSING_TAGS.sub('', line)
# End of Article class
# Main
# Create the directory where HTML files will be written.  This must happen
# before the index is generated, because hashName() creates sub-directories
# inside it.
if not os.path.isdir(outputDirectory):
    os.mkdir(outputDirectory)

print("### Generate index")
articles = ["Africa", "Antarctica", "Asia", "South Asia", "Southeast Asia", "Caribbean", "Central America", "Europe", "Middle East", "North America", "South America", "Other destinations", "Travel topics"]
with open("index.html", "w") as index:
    index.write("<html> <head><title>OxygenGuide</title></head> <body> <ul>")
    for article in articles:
        index.write('<li><a href="' + outputDirectory + '/')
        index.write(hashName(article))
        index.write('">')
        index.write(article)
        index.write('</a></li>')
    index.write('</ul>')
    index.write('<p>This content is based on work by all volunteers of <a href="http://wikivoyage.org">Wikivoyage</a> and <a href="http://wikitravel.org">Wikitravel</a>.')
    index.write('Text is available under <a href="http://creativecommons.org/licenses/by-sa/1.0/">Creative Commons Attribution-ShareAlike 1.0</a>.')
    # Ampersands are written as '&amp;' so that '&section' cannot be read as
    # the '&sect' character entity.
    index.write('Comments welcome on <a href="https://en.wikivoyage.org/w/index.php?title=User_talk:Nicolas1981&amp;action=edit&amp;section=new">my user page</a>.</p>')
    index.write('</body> </html>')

# First pass over the dump: collect article titles, redirects and parents.
# Tag lines are recognised after stripping leading whitespace, so detection
# does not depend on how deeply the dump indents its elements.
print("### Build list of articles and map of redirects")
redirects = {}        # Title of a redirect page -> title it points to.
articleNames = set()  # Titles of all non-redirect pages (set: fast lookups).
isPartOfs = {}        # Title -> parent title taken from IsPartOf / IsIn.
redirect = 0
isPartOf = 0
with open(databaseDump) as dump:
    for line in dump:
        tagLine = line.lstrip()
        if tagLine.startswith("<title>"):
            articleName = line.partition('>')[2].partition('<')[0]
        if tagLine.startswith("<redirect"):
            redirect = 1
            # Value of the first quoted attribute, without any '#anchor'.
            target = line.partition('"')[2].partition('"')[0].partition('#')[0]
        if line.startswith(("{{IsPartOf|", "{{isPartOf|")):
            isPartOf = line[11:].partition('}')[0]
            isPartOf = isPartOf.replace("_", " ")
        if line.startswith(("{{IsIn|", "{{isIn|")):
            isPartOf = line[7:].partition('}')[0]
            isPartOf = isPartOf.replace("_", " ")
        if tagLine.startswith("</page>"):
            if redirect:
                redirects[articleName] = target
            else:
                articleNames.add(articleName)
                if isPartOf != 0:
                    isPartOfs[articleName] = isPartOf
            # Reset the per-page state for the next page.
            redirect = 0
            isPartOf = 0
print(str(len(redirects)) + " redirects")
print(str(len(articleNames)) + " articles")
print(str(len(isPartOfs)) + " articles with breadcrumb")

print("### Check for double-redirects")
for (name, target) in redirects.items():
    if target in redirects:
        print("# Double redirect detected, please fix: [[" + name + "]] > [[" + target + "]] > [[" + redirects[target] + "]]")

# Second pass over the dump: convert every page that is neither skipped nor
# a redirect.
print("### Generate articles")
re_before_text = re.compile('.*preserve">', re.DOTALL)
re_from_sha1 = re.compile(r'\s*<sha1>.*', re.DOTALL)
flag = 0  # 1 while the lines of a <page> element are being collected.
skip = 0  # 1 when the current page must not be converted.
with open(databaseDump) as dump:
    for line in dump:
        tagLine = line.lstrip()
        if tagLine.startswith("<title>"):
            # Skip GPS traces and articles such as Template: Title: Wikivoyage:
            if "/Gpx" in line or ":" in line:
                skip = 1
            else:
                articleName = line.partition('>')[2].partition('<')[0]
        if tagLine.startswith("</page>"):
            flag = 0
            if skip:
                skip = 0
            else:
                # Keep what follows the opening tag of the text element ...
                wikicode = re_before_text.sub('', page)
                if not is_redirect(wikicode):
                    # ... and drop everything from the <sha1> element on.
                    wikicode = re_from_sha1.sub('', wikicode)
                    article = Article(wikicode, articleName)
                    article.saveHTML()
        if tagLine.startswith("<page>"):
            flag = 1
            page = ""
        if flag and not tagLine.startswith("<page>"):
            page += line