#!/usr/bin/env python
# -*- encoding: UTF8 -*-
# Copyright 2012 Philipp Klaus
# Part of https://github.com/vLj2/wikidot-to-markdown
# Improved 2016 by Christopher Mitchell
# https://github.com/KermMartian/wikidot-to-markdown
# Improved 2022 by Matthew Walker
# https://github.com/bodekerscientific/wikidot-to-mediawiki
import regex as re
import uuid  # to generate random UUIDs using uuid.uuid4()


class WikidotToMediaWiki():
    def __init__(self):
        # regex for URL found on http://regexlib.com/REDetails.aspx?regex_id=501
        self.url_regex = r"(http|https|ftp)\://([a-zA-Z0-9\.\-]+(\:[a-zA-Z0-9\.&%\$\-]+)*@)*((25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9])\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]{1}[0-9]{2}|[1-9]{1}[0-9]{1}|[0-9])|localhost|([a-zA-Z0-9\-]+\.)*[a-zA-Z0-9\-]+\.(com|edu|gov|int|mil|net|org|biz|arpa|info|name|pro|aero|coop|museum|[a-zA-Z]{2}))(\:[0-9]+)*(/($|[a-zA-Z0-9\.\,\?\'\\\+&%\$#\=~_\-]+))*[/]?"
        self.static_replacements = { '[[toc]]': '', # MediaWiki generates its own table of contents, so [[toc]] is dropped
                                   }
        self.regex_replacements = { r'([^:])//([\s\S ]*?[^:])//': r"\1''\2''", # italics
                                    r'([^:])\*\*([\s\S ]*?)\*\*': r"\1'''\2'''", # bold
                                    r'([^:])\[!--([\s\S ]*?)--\]': r"\1<!--\2-->", # comments
                                    r'([^:])__([\s\S ]*?)__': r"\1'''\2'''", # underlining → bold
                                    #r'([^:]){{([\s\S ]*?)}}': r'\1`\2`', # inline monospaced text
                                    r'\^\^([\S\s]*?)\^\^': r'<sup>\1</sup>', # superscript
                                    r'\[\[size\s*\S*?\]\]': r'', # ignore text sizing
                                    r'\[\[/size\s*?\]\]': r'', # ignore text sizing
                                  }
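        # Example: under the rules above, "**bold** and //italic//" becomes
        # "'''bold''' and ''italic''"; the leading [^:] guards keep the "//"
        # inside URLs such as http://example.com from being italicised.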
        self.regex_split_condition = r"^\+ ([^\n]*)$"

    def convert(self, text, file_prefix="", fullname_to_title={}):
        text = '\n' + text + '\n'  # embed the text in newlines (makes the regex replacements work better)
        # first we search for [[code]] statements as we don't want any replacement to happen inside those code blocks!
        code_blocks = dict()
        code_blocks_found = re.findall(re.compile(r'(\[\[code( type="([\S]+)")?\]\]([\s\S ]*?)\[\[/code\]\])', re.MULTILINE), text)
        for code_block_found in code_blocks_found:
            tmp_hash = str(uuid.uuid4())
            text = text.replace(code_block_found[0], tmp_hash, 1)  # replace code block with a hash - to fill it in later
            code_blocks[tmp_hash] = code_block_found[-1]
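        # Each [[code]] body is now parked behind an opaque UUID placeholder,
        # so none of the markup rewrites below can touch it; the blocks are
        # substituted back in, wrapped in <nowiki>, near the end of convert().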
        for search, replacement in self.static_replacements.items():
            text = text.replace(search, replacement, 1)
        # search for any of the simpler replacements in the dictionary regex_replacements
        for s_reg, r_reg in self.regex_replacements.items():
            text = re.sub(re.compile(s_reg, re.MULTILINE), r_reg, text)
        # TITLES -- replace '+++ X' with '=== X ==='
        for titles in re.finditer(r"^(\++)([^\n]*)$", text, re.MULTILINE):
            header = ("=" * len(titles.group(1)))
            text = text.replace(titles.group(0), header + (titles.group(2) + " ") + header)
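        # e.g. "+++ Results" becomes "=== Results ===" -- the heading depth
        # follows the number of leading plus signs.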
        # LISTS(*) -- replace '  *' with '***' and so on
        for stars in re.finditer(r"^([ \t]+)\*", text, re.MULTILINE):
            text = text[:stars.start(1)] + ("*" * len(stars.group(1))) + text[stars.end(1):]
        # LISTS(#) -- replace '  #' with '###' and so on
        for hashes in re.finditer(r"^([ \t]+)#", text, re.MULTILINE):
            text = text[:hashes.start(1)] + ("#" * len(hashes.group(1))) + text[hashes.end(1):]
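        # Each leading space or tab becomes one extra marker, so "  * item"
        # (two spaces) becomes "*** item" and "  # step" becomes "### step",
        # matching MediaWiki's nested-list syntax.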
        # Internal links -- replace [[[internal link]]] with [[internal link]], informed
        # by mapping from fullname to title
        lower_title_to_case_sensitive_title = {
            title.lower(): title
            for title in fullname_to_title.values()
        }
        def convert_internal_link(link: str):
            # Trim whitespace at either end
            link = link.strip()
            # Convert underscores and at-signs to hyphens (because that's what Wikidot did)
            fullname = re.sub(r"[_@]", r"-", link)
            # Convert link to lower case (because that's what Wikidot did)
            fullname = fullname.lower()
            if fullname in fullname_to_title:
                return fullname_to_title[fullname]
            else:
                # Check if link references page title rather than its fullname
                #title = re.sub(" ", "_", link)
                lower_title = link.lower()  # Wikidot seems to be case-insensitive
                if lower_title in lower_title_to_case_sensitive_title:
                    return lower_title_to_case_sensitive_title[lower_title]
                print(f" Failed to find page given the link '{link}'")
                return link
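        # e.g. with fullname_to_title={"start-page": "Start Page"} (a made-up
        # mapping), the link "start_page" normalises to fullname "start-page"
        # and resolves to the title "Start Page".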
        internal_links = []
        for inlink in re.finditer(r"\[\[\[([\s\S]*?)\]\]\]", text):
            inlink_contents = inlink.group(1)
            # Check for existence of alternative text
            pattern = re.compile(r"([\S\s]*?)[\s]*\|[\s]*([\S\s]*?)")
            match = pattern.fullmatch(inlink_contents)
            if match is not None:
                # Contents contain alternative text
                internal_page = convert_internal_link(match.group(1))
                alt_text = match.group(2)
                replacement_contents = f"[[{internal_page}|{alt_text}]]"
            else:
                # Contents must be only the name of the internal page
                internal_page = convert_internal_link(inlink_contents)
                replacement_contents = f"[[{internal_page}]]"
            text = text.replace(inlink.group(0), replacement_contents)
            internal_links.append(internal_page)
        # Images
        linked_files = []
        for image in re.finditer(r"\[\[(f?[=<>]?)image\s*([\S\s]*?)\s*\]\]", text):
            image_format = image.group(1)
            image_contents = image.group(2)
            # Process the horizontal alignment of the image
            options = []
            if image_format == "<" or image_format == "f<":
                options.append("left")
            if image_format == ">" or image_format == "f>":
                options.append("right")
            if image_format == "=":
                options.append("center")
            # Process any attributes
            filename = image_contents
            for attribute in re.finditer(r"(\S*?)=\"([\S\s]*?)\"", image_contents, re.MULTILINE):
                key = attribute.group(1)
                value = attribute.group(2)
                if key == "width":
                    options.append(value)
                if key == "height":
                    options.append("x" + value)
                if key == "size":
                    if value == "square":
                        options.append("75x75px")
                    if value == "thumbnail":
                        options.append("100px")
                    if value == "small":
                        options.append("240px")
                    if value == "medium":
                        options.append("500px")
                    if value == "medium640":
                        options.append("640px")
                    if value == "large":
                        options.append("1024px")
                if key == "link":
                    options.append(f"link={value.lstrip()}")
                if key == "alt":
                    options.append(f"alt={value.lstrip()}")
                # Plain string replacement here: the attribute text may contain
                # regex metacharacters, so re.sub on it would be unsafe
                filename = filename.replace(attribute.group(0), "")
            original_filename = filename.strip()
            filename = file_prefix + original_filename
            file_contents = "|".join([filename] + options)
            text = text.replace(image.group(0), "[[File:" + file_contents + "]]")
            linked_files.append(original_filename)
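        # e.g. '[[<image plot.png size="small"]]' with file_prefix="Proj_"
        # (made-up names) becomes "[[File:Proj_plot.png|left|240px]]".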
        # Gallery
        for gallery in re.finditer(r"\[\[gallery[ \S]*?\]\]([\S\s ]*)\[\[/gallery\]\]", text, re.MULTILINE):
            replacement_gallery = "<gallery>\n"
            gallery_content = gallery.group(1)
            for filename_match in re.finditer(r"^: ([\S]*)", gallery_content, re.MULTILINE):
                original_filename = filename_match.group(1)
                filename = file_prefix + original_filename
                replacement_gallery += filename + "\n"
                linked_files.append(original_filename)
            replacement_gallery += "</gallery>"
            text = text.replace(gallery.group(0), replacement_gallery)
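        # e.g. a [[gallery]] block listing ": photo1.jpg" and ": photo2.jpg"
        # becomes a <gallery>...</gallery> block with one prefixed filename
        # per line.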
        # Files
        for file in re.finditer(r"\[\[file[\s]*([\S\s]*?)[\s]*\]\]", text, re.MULTILINE):
            file_contents = file.group(1)
            # Check for existence of alternative text
            pattern = re.compile(r"([\S\s]*?)[\s]*\|[\s]*([\S\s]*?)")
            match = pattern.fullmatch(file_contents)
            if match is not None:
                # Contents contain alternative text
                original_filename = match.group(1)
                filename = file_prefix + original_filename
                alt_text = match.group(2)
                replacement_contents = f"[[Media:{filename}|{alt_text}]]"
            else:
                # Contents must be only the filename
                original_filename = file_contents
                filename = file_prefix + original_filename
                replacement_contents = f"[[Media:{filename}|{filename}]]"
            text = text.replace(file.group(0), replacement_contents)
            linked_files.append(original_filename)
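        # e.g. "[[file report.pdf | the report]]" (a made-up file) becomes
        # "[[Media:<prefix>report.pdf|the report]]".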
        # START TABLE
        for table in re.finditer(r"\[\[table([\s\S ]*?)\]\]", text):
            #text = text.replace(table.group(0), "{|" + table.group(1))
            text = text.replace(table.group(0), "{|")
        # START ROW
        for row in re.finditer(r"\[\[row([\s\S ]*?)\]\]", text):
            #text = text.replace(row.group(0), "|-" + row.group(1))
            text = text.replace(row.group(0), "|-")
        # START CELL
        for cell in re.finditer(r"\[\[cell([\s\S ]*?)\]\]", text):
            #text = text.replace(cell.group(0), "|" + cell.group(1))
            text = text.replace(cell.group(0), "|")
        # ENDS
        for end in re.finditer(r"\[\[/([\s\S ]*?)\]\]", text):
            token = end.group(1)
            if token == "table":
                text = text.replace(end.group(0), "|}")
            elif token == "row":
                # end row tags are not necessary in mediawiki
                text = text.replace(end.group(0), "")
            elif token == "cell":
                # end cell tags are not necessary in mediawiki
                text = text.replace(end.group(0), "")
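        # Together these turn [[table]]/[[row]]/[[cell]] markup into MediaWiki's
        # "{|" (open table), "|-" (new row), "|" (new cell) and "|}" (close table).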
        # Substitute back our code blocks
        for tmp_hash, code in code_blocks.items():
            code = "\n <nowiki>" + code + "</nowiki>"
            text = text.replace(tmp_hash, code, 1)
        # Process color corrections
        startpos = 0
        while -1 != startpos:
            startpos = text.find("##", startpos)
            pipepos = text.find("|", startpos + 2)
            endpos = text.find("##", startpos + 2)
            if startpos != -1 and pipepos != -1 and endpos != -1 and endpos > pipepos:
                color = text[startpos + 2 : pipepos].strip()
                colored = text[pipepos + 1 : endpos].strip()
                text = text[: startpos] + "<span style=\"color:" + color + "\">" + colored + \
                       "</span>" + text[endpos + 2 :]
            startpos = endpos
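        # e.g. "##blue|warning##" becomes '<span style="color:blue">warning</span>'.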
        # Process math corrections
        startpos = 0
        while -1 != startpos:
            startpos = text.find("[[$", startpos)
            endpos = text.find("$]]", startpos)
            if startpos != -1 and endpos != -1:
                mathtext = text[startpos + 3 : endpos].strip()
                text = text[: startpos] + "<math>" + mathtext + "</math>" + text[endpos + 3 :]
            startpos = endpos
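        # e.g. "[[$ E = mc^2 $]]" becomes "<math>E = mc^2</math>".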
        # Process table corrections
        startpos = 0
        while -1 != startpos:
            startpos = text.find("\n||", startpos)
            if startpos == -1:
                break
            # Find end of table
            endpos = text.find("\n", startpos + 3)
            while endpos < len(text) - 3 and "||" == text[endpos + 1 : endpos + 3]:
                endpos = text.find("\n", endpos + 3)
            # Found bounds of text chunk: reformat table
            fixup = text[startpos + 1 : endpos].replace("||~", "!!")
            fixup = fixup.split("\n")
            fixout = ["", "{| class=\"wikitable\""]
            for i in range(len(fixup)):
                if fixup[i][0 : 2] == "||" or fixup[i][0 : 2] == "!!":
                    out = fixup[i].strip()[1 : ]
                    # Slice to None (not 0) when there is no trailing "||"/"!!",
                    # so the whole cell text is kept rather than dropped
                    fixout.append(out[ : -2 if out[-2 : ] in ["||", "!!"] else None])
                else:
                    message = "Failed to parse item %d/%d: '%s'" % (i, len(fixup), fixup[i])
                    raise Exception(message)
                fixout.append("|}" if i == len(fixup) - 1 else "|-")
            # Construct output table text
            fullout = "\n".join(fixout)
            text = text[: startpos] + fullout + text[endpos :]
            startpos = startpos + len(fullout)
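        # e.g. the two Wikidot rows "||~ Name ||~ Age ||" and "|| Ada || 36 ||"
        # become a {| class="wikitable" block with a "! Name !! Age" header row
        # and a "| Ada || 36" data row.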
        # Repair multi-newlines (note: re.M must be passed as flags, not as the positional count argument)
        text = re.sub(r"\n\n+", "\n\n", text, flags=re.M)
        # Repair starting newlines
        text = text.strip()
        return text, internal_links, linked_files
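

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module): run a snippet of
    # Wikidot markup through the converter and inspect the result.
    converter = WikidotToMediaWiki()
    sample = "+ Intro\nSome **bold** and //italic// text."
    converted, internal_links, linked_files = converter.convert(sample)
    print(converted)        # "= Intro =\nSome '''bold''' and ''italic'' text."
    print(internal_links)   # [] -- no [[[...]]] links in this sample
    print(linked_files)     # [] -- no images or files in this sample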