Skip to content

Commit

Permalink
Fix flake8 and pylint warnings
Browse files Browse the repository at this point in the history
  • Loading branch information
p12tic committed Oct 24, 2018
1 parent 96fefd1 commit d9bd30f
Show file tree
Hide file tree
Showing 36 changed files with 949 additions and 610 deletions.
13 changes: 9 additions & 4 deletions build_link_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import os
from link_map import LinkMap


# returns a dict { title -> filename }.
# directory - either 'output/reference' or 'reference'
def build_link_map(directory):
Expand All @@ -41,26 +42,30 @@ def build_link_map(directory):
text = f.read()
f.close()

m = re.search('<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>', text)
m = re.search(r'<script>[^<]*mw\.config\.set([^<]*wgPageName[^<]*)</script>', text) # noqa
if not m:
continue
text = m.group(1)
text = re.sub('\s*', '', text)
text = re.sub(r'\s*', '', text)
m = re.search('"wgPageName":"([^"]*)"', text)
if not m:
continue

title = m.group(1)

target = os.path.relpath(os.path.abspath(fn), os.path.abspath(directory))
target = os.path.relpath(os.path.abspath(fn),
os.path.abspath(directory))
link_map.add_link(title, target)
return link_map


def main():
    """Build the link map for 'output/reference' and save it to disk.

    The map is produced by build_link_map() and written to
    'output/link-map.xml'.
    """
    link_map = build_link_map('output/reference')

    # create an xml file containing mapping between page title and actual
    # location
    link_map.write('output/link-map.xml')


if __name__ == "__main__":
    main()
102 changes: 70 additions & 32 deletions commands/preprocess.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,23 +19,24 @@

import fnmatch
import io
from lxml import etree
import re
import os
import sys
import shutil
import urllib.parse
from xml_utils import xml_escape, xml_unescape
from lxml import etree


def rmtree_if_exists(dir):
    """Recursively delete the directory `dir` if it exists.

    Does nothing when `dir` is not an existing directory (including when the
    path is missing or refers to a regular file).

    The parameter name shadows the `dir` builtin; it is kept unchanged so
    that existing keyword callers continue to work.
    """
    if os.path.isdir(dir):
        shutil.rmtree(dir)


def move_dir_contents_to_dir(srcdir, dstdir):
    """Move every entry directly inside `srcdir` into `dstdir`.

    Each entry keeps its own name. `srcdir` itself is left in place (empty
    afterwards); `dstdir` is expected to exist already.
    """
    for fn in os.listdir(srcdir):
        shutil.move(os.path.join(srcdir, fn),
                    os.path.join(dstdir, fn))


def rearrange_archive(root):
# rearrange the archive. {root} here is output/reference

Expand Down Expand Up @@ -71,8 +72,10 @@ def rearrange_archive(root):
move_dir_contents_to_dir(src_data_path, data_path)

# also copy the custom fonts
shutil.copy(os.path.join(path, 'DejaVuSansMonoCondensed60.ttf'), data_path)
shutil.copy(os.path.join(path, 'DejaVuSansMonoCondensed75.ttf'), data_path)
shutil.copy(os.path.join(path, 'DejaVuSansMonoCondensed60.ttf'),
data_path)
shutil.copy(os.path.join(path, 'DejaVuSansMonoCondensed75.ttf'),
data_path)

# remove what's left
shutil.rmtree(path)
Expand All @@ -81,20 +84,23 @@ def rearrange_archive(root):
for fn in fnmatch.filter(os.listdir(root), 'cppreference-export*.xml'):
os.remove(os.path.join(root, fn))

def convert_loader_name(fn):
    """Convert a MediaWiki loader URL to a simplified file name.

    `fn` is the complex URL of a resource supplied by the MediaWiki loader.
    Returns the simplified name used for that resource.

    Raises Exception if `fn` does not match any known loader resource.
    """
    # The checks are ordered: the first matching rule wins.
    if "modules=site&only=scripts" in fn:
        return "site_scripts.js"
    if "modules=site&only=styles" in fn:
        return "site_modules.css"
    if "modules=startup&only=scripts" in fn:
        return "startup_scripts.js"
    if re.search(r"modules=skins.*&only=scripts", fn):
        return "skin_scripts.js"
    if re.search(r"modules=.*ext.*&only=styles", fn):
        return "ext.css"
    msg = 'Loader file {0} does not match any known files'.format(fn)
    raise Exception(msg)


def build_rename_map(root):
# Returns a rename map: a map from old to new file name
Expand Down Expand Up @@ -122,13 +128,17 @@ def build_rename_map(root):
if num > 0:
name, ext = os.path.splitext(fn)
# add file with its path -> only rename that occurrence
result[os.path.join(dir, fn)] = "{}.{}{}".format(name, num + 1, ext)
result[os.path.join(dir, fn)] = "{}.{}{}".format(name, num + 1,
ext)
seen[low] += 1

return result


def rename_files(root, rename_map):
for dir, old_fn in ((dir, fn) for dir, _, filenames in os.walk(root) for fn in filenames):
for dir, old_fn in ((dir, fn)
for dir, _, filenames in os.walk(root)
for fn in filenames):
src_path = os.path.join(dir, old_fn)

new_fn = rename_map.get(old_fn)
Expand All @@ -144,6 +154,7 @@ def rename_files(root, rename_map):
print("Renaming {0}\n to {1}".format(src_path, dst_path))
shutil.move(src_path, dst_path)


def find_html_files(root):
# find files that need to be preprocessed
html_files = []
Expand All @@ -152,21 +163,25 @@ def find_html_files(root):
html_files.append(os.path.join(dir, filename))
return html_files


def is_loader_link(target):
    """Return True if `target` is an absolute cppreference load.php URL.

    The URL must start with http(s)://<lang>.cppreference.com/mwiki/load.php;
    anything after that prefix (e.g. a query string) is accepted.
    """
    return re.match(r'https?://[a-z]+\.cppreference\.com/mwiki/load\.php',
                    target) is not None


def transform_loader_link(target, file, root):
    """Rewrite an absolute loader link as a path relative to `file`.

    target is the loader URL to transform.
    file is the path of the file the link came from.
    root is the path to the root of the archive.

    The resource is addressed as <root>/common/<simplified name>, where the
    simplified name comes from convert_loader_name(); the result is that
    location expressed relative to the directory containing `file`.
    """
    # Absolute loader.php links need to be made relative
    abstarget = os.path.join(root, "common", convert_loader_name(target))
    return os.path.relpath(abstarget, os.path.dirname(file))


def is_ranges_placeholder(target):
    """Return True if `target` is an absolute ranges placeholder URL.

    Matches http(s)://<lang>.cppreference.com/w/cpp/ranges-placeholder/<page>
    and the ranges-<name>-placeholder/<page> variant; <page> must be
    non-empty.
    """
    # The pattern is split across two adjacent literals only to keep the
    # line length within limits; the resulting regex is unchanged.
    pattern = (r'https?://[a-z]+\.cppreference\.com/w/cpp/'
               r'ranges(-[a-z]+)?-placeholder/.+')
    return re.match(pattern, target) is not None


def transform_ranges_placeholder(target, file, root):
# Placeholder link replacement is implemented in the MediaWiki site JS at
# https://en.cppreference.com/w/MediaWiki:Common.js
Expand All @@ -175,9 +190,9 @@ def transform_ranges_placeholder(target, file, root):
repl = (r'\1/cpp/experimental/ranges/\2' if ranges else r'\1/cpp/\2')

if 'ranges-placeholder' in target:
match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)'
match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-placeholder/(.+)' # noqa
else:
match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)'
match = r'https?://([a-z]+)\.cppreference\.com/w/cpp/ranges-([a-z]+)-placeholder/(.+)' # noqa
repl += (r'/\3' if ranges else r'/ranges/\3')

# Turn absolute placeholder link into site-relative link
Expand All @@ -187,24 +202,27 @@ def transform_ranges_placeholder(target, file, root):
abstarget = os.path.join(root, reltarget)
return os.path.relpath(abstarget, os.path.dirname(file))


def is_external_link(target):
    """Return True if `target` has a URL scheme or a network location.

    Plain relative paths (optionally with a fragment) have neither and are
    reported as not external.
    """
    url = urllib.parse.urlparse(target)
    return bool(url.scheme or url.netloc)


def trasform_relative_link(rename_map, target, file):
# urlparse returns (scheme, host, path, params, query, fragment)
_, _, path, params, _, fragment = urllib.parse.urlparse(target)
assert params == ''

path = urllib.parse.unquote(path)
path = path.replace('../../upload.cppreference.com/mwiki/','../common/')
path = path.replace('../mwiki/','../common/')
path = path.replace('../../upload.cppreference.com/mwiki/', '../common/')
path = path.replace('../mwiki/', '../common/')

dir, fn = os.path.split(path)
new_fn = rename_map.get(fn)
if new_fn:
# look for case conflict of the renamed file
abstarget = os.path.normpath(os.path.join(os.path.dirname(file), dir, new_fn))
abstarget = os.path.normpath(os.path.join(os.path.dirname(file),
dir, new_fn))
new_fn = rename_map.get(abstarget, new_fn)
else:
# original filename unchanged, look for case conflict
Expand All @@ -216,11 +234,13 @@ def trasform_relative_link(rename_map, target, file):
path = urllib.parse.quote(path)
return urllib.parse.urlunparse(('', '', path, params, '', fragment))


# Transforms a link in the given file according to rename map.
# target is the link to transform.
# file is the path of the file the link came from.
# root is the path to the root of the archive.
def transform_link(rename_map, target, file, root):

if is_loader_link(target):
return transform_loader_link(target, file, root)

Expand All @@ -232,6 +252,7 @@ def transform_link(rename_map, target, file, root):

return trasform_relative_link(rename_map, target, file)


def has_class(el, *classes_to_check):
value = el.get('class')
if value is None:
Expand All @@ -242,6 +263,7 @@ def has_class(el, *classes_to_check):
return True
return False


# remove non-printable elements
def remove_noprint(html):
for el in html.xpath('//*'):
Expand All @@ -250,14 +272,16 @@ def remove_noprint(html):
elif el.get('id') in ['toc', 'catlinks']:
el.getparent().remove(el)


# remove see also links between C and C++ documentations
def remove_see_also(html):
for el in html.xpath('//tr[@class]'):
if not has_class(el, 't-dcl-list-item', 't-dsc'):
continue

child_tds = el.xpath('.//td/div[@class]')
if not any(has_class(td, 't-dcl-list-see', 't-dsc-see') for td in child_tds):
if not any(has_class(td, 't-dcl-list-see', 't-dsc-see')
for td in child_tds):
continue

# remove preceding separator, if any
Expand All @@ -276,17 +300,23 @@ def remove_see_also(html):
next = el.getnext()
if next is None:
el.getparent().remove(el)
elif next.tag == 'table' and has_class(next, 't-dcl-list-begin') and len(next.xpath('.//tr')) == 0:
elif next.tag == 'table' and has_class(next, 't-dcl-list-begin') and \
len(next.xpath('.//tr')) == 0:
el.getparent().remove(el)
next.getparent().remove(next)


# remove Google Analytics scripts
def remove_google_analytics(html):
    """Remove Google Analytics <script> elements found at /html/body/script.

    A script is removed when either
      * its 'src' attribute contains 'google-analytics.com/ga.js', or
      * its inline text contains 'google-analytics.com/ga.js' or
        'pageTracker'.

    `html` must provide an lxml-style xpath() method, and the returned
    elements must support get(), .text and getparent().
    """
    for el in html.xpath('/html/body/script'):
        src = el.get('src')
        text = el.text

        by_src = src is not None and 'google-analytics.com/ga.js' in src
        # The inline text is checked even when a non-matching 'src' is
        # present. Nesting the text check under "src is None" would skip
        # such scripts.
        by_text = text is not None and (
            'google-analytics.com/ga.js' in text or 'pageTracker' in text)

        if by_src or by_text:
            el.getparent().remove(el)


# remove Carbon ads
def remove_ads(html):
Expand All @@ -297,13 +327,15 @@ def remove_ads(html):
if el.text is not None and '#carbonads' in el.text:
el.getparent().remove(el)


# remove links to file info pages (e.g. on images)
def remove_fileinfo(html):
    """Remove the parent element of every link to a File: info page.

    A link qualifies when its href matches
    http(s)://<lang>.cppreference.com/w/File: (tested with the EXSLT
    regular-expressions extension). The whole parent of the <a> element is
    detached from the tree, not just the link itself.
    """
    info = etree.XPath(
        r"//a[re:test(@href, 'https?://[a-z]+\.cppreference\.com/w/File:')]/..",  # noqa
        namespaces={'re': 'http://exslt.org/regular-expressions'})
    for el in info(html):
        el.getparent().remove(el)


# remove external links to unused resources
def remove_unused_external(html):
for el in html.xpath('/html/head/link'):
Expand All @@ -313,6 +345,7 @@ def remove_unused_external(html):
(head, tail) = os.path.split(el.get('href'))
el.set('href', os.path.join(head, 'common', tail))


def preprocess_html_file(root, fn, rename_map):
parser = etree.HTMLParser()
html = etree.parse(fn, parser)
Expand All @@ -331,23 +364,27 @@ def preprocess_html_file(root, fn, rename_map):
for el in html.xpath('//*[@href]'):
el.set('href', transform_link(rename_map, el.get('href'), fn, root))

for err in parser.error_log:
for err in list(parser.error_log):
print("HTML WARN: {0}".format(err), file=output)

html.write(fn, encoding='utf-8', method='html')
return output.getvalue()


def preprocess_css_file(fn):
f = open(fn, "r", encoding='utf-8')
text = f.read()
f.close()

# note that query string is not used in css files

text = text.replace('../DejaVuSansMonoCondensed60.ttf', 'DejaVuSansMonoCondensed60.ttf')
text = text.replace('../DejaVuSansMonoCondensed75.ttf', 'DejaVuSansMonoCondensed75.ttf')
text = text.replace('../DejaVuSansMonoCondensed60.ttf',
'DejaVuSansMonoCondensed60.ttf')
text = text.replace('../DejaVuSansMonoCondensed75.ttf',
'DejaVuSansMonoCondensed75.ttf')

text = text.replace('../../upload.cppreference.com/mwiki/images/', 'images/')
text = text.replace('../../upload.cppreference.com/mwiki/images/',
'images/')

# QT Help viewer doesn't understand nth-child
text = text.replace('nth-child(1)', 'first-child')
Expand All @@ -356,6 +393,7 @@ def preprocess_css_file(fn):
f.write(text)
f.close()


def preprocess_startup_script(fn):
with open(fn, "r", encoding='utf-8') as f:
text = f.read()
Expand Down
Loading

0 comments on commit d9bd30f

Please sign in to comment.