-
Notifications
You must be signed in to change notification settings - Fork 1
/
extract_de_ipa_en.py
60 lines (48 loc) · 1.86 KB
/
extract_de_ipa_en.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# Extract German IPA transcriptions from an English Wiktionary XML dump.
#
# The dump is scanned line by line (it is far too large to parse as a whole).
# For every main-namespace page, lines of the form "* {{IPA|de|...}}" or
# "** {{IPA|de|...}}" found inside a "Pronunciation" section are printed to
# stdout, one record per line, in the format selected by OUTPUT_FORMAT.

DUMP_PATH = 'enwiktionary-20220120-pages-meta-current.xml'

# "csv": title,ipa[,"extra templates"]
# "all": human-readable "page: ...; IPA de: ...[; extra templates]"
OUTPUT_FORMAT = "csv"

# Bullet prefixes that introduce a German IPA template on a wikitext line.
IPA_LINE_PREFIXES = ("* {{IPA|de|", "** {{IPA|de|")


def _csv_field(value, force_quote=False):
    """Return *value* as a single CSV field.

    The field is wrapped in double quotes when it contains a comma, a double
    quote or a line break (or when *force_quote* is true); embedded double
    quotes are doubled so that the row stays parseable.
    """
    if force_quote or any(ch in value for ch in ',"\r\n'):
        return '"' + value.replace('"', '""') + '"'
    return value


def _is_pronunciation_heading(line):
    """Return True if *line* is a Pronunciation heading of level 3 or deeper.

    Level 4 ("====Pronunciation====") occurs when the section is nested under
    an "Etymology N" heading; matching only level 3 would drop those entries.
    """
    return line.startswith("===") and line.lstrip("=").startswith("Pronunciation===")


def iter_ipa_entries(lines):
    """Yield ``(page_title, ipa, extras)`` for each German IPA line in *lines*.

    *lines* is any iterable of text lines from the dump (e.g. the open file).

    * ``page_title`` is the text between ``<title>`` and ``</title>``; pages
      whose title contains ":" (namespaced pages) are skipped.
    * ``ipa`` is the argument list of the first template on the line with the
      "IPA|de|" prefix removed and remaining "|" separators turned into ",".
    * ``extras`` is a list with one string per further "{{...}}" template on
      the same line, with "|" replaced by ":" and the closing "}}" removed.
    """
    page_title = ""
    in_pronunciation = False
    for line in lines:
        if "</page>" in line:
            # End of page: forget all per-page state.
            page_title = ""
            in_pronunciation = False
            continue
        if "<title>" in line:
            title = line.replace("<title>", "").replace("</title>", "").strip()
            if ":" not in title:  # skip pages in namespaces
                page_title = title
            continue
        if not page_title:
            continue
        if line.startswith("=="):
            # Any heading either opens a Pronunciation section or closes it.
            in_pronunciation = _is_pronunciation_heading(line)
            continue
        if not in_pronunciation or not line.startswith(IPA_LINE_PREFIXES):
            continue
        # parts[0] is the bullet, parts[1] the IPA template, the rest are
        # trailing templates such as {{audio|...}}.
        parts = line.strip().split("{{")
        ipa = parts[1].replace("IPA|de|", "").replace("}}", "").replace("|", ",").strip()
        extras = [p.replace("|", ":").replace("}}", "").strip() for p in parts[2:]]
        yield page_title, ipa, extras


def format_entry(page_title, ipa, extras, output_format="csv"):
    """Render one extracted entry as an output line (without trailing newline).

    Raises:
        ValueError: if *output_format* is neither "csv" nor "all".
    """
    if output_format == "all":
        text = "page: " + page_title + "; IPA de: " + ipa
        if extras:
            text += "; " + ", ".join(extras)
        return text
    if output_format == "csv":
        fields = [_csv_field(page_title), _csv_field(ipa)]
        if extras:
            # Double quotes inside the extras are downgraded to apostrophes,
            # and the column is always quoted because it is comma-joined.
            joined = ", ".join(e.replace("\"", "'") for e in extras)
            fields.append(_csv_field(joined, force_quote=True))
        return ",".join(fields)
    raise ValueError("unknown output format: " + repr(output_format))


def main():
    """Scan the dump at DUMP_PATH and print every German IPA entry."""
    # The dump is UTF-8; be explicit so the result does not depend on the
    # platform's default encoding. The context manager closes the file.
    with open(DUMP_PATH, encoding="utf-8", newline='') as dump:
        for page_title, ipa, extras in iter_ipa_entries(dump):
            print(format_entry(page_title, ipa, extras, OUTPUT_FORMAT))


if __name__ == "__main__":
    main()