#!/usr/bin/python
# -*- coding: utf-8 -*-
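"""Scan a Latvian Wikipedia XML dump for recurring wikitext problems
("nākoš-" and "sekojoš-" typos, plain ISBNs, doubled words, mismatched
year links) and publish the findings as report pages on lv.wikipedia."""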
import re
import html

import pywikibot
import toolforge
from pywikibot import xmlreader

from checkDump import getLastFile, setLastDump
conn = toolforge.connect('lvwiki_p')  # Wiki Replicas connection (not used below)
site = pywikibot.Site("lv", "wikipedia")
# Unused here: main() resolves the actual dump via getLastFile(); curdate
# only appears in the commented-out report-file names below.
xmlfile = '../../lvwiki-20180501-pages-articles.xml'
curdate = '2018-05-01'
# "nākoš-" typo (non-standard form of "nākamais")
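# The lookbehinds skip verb derivatives such as "ienākoš-", "iznākoš-",
# "panākoš-" and "pienākoš-"; the lookahead skips template parameters
# like "nākošā misija =".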
check_nakosais = re.compile(r"(?<!(ie|iz|pa))(?<!pie)(nākoš)(?!(ā|ā misija|ā turneja|ais_p|)\s*=)", flags=re.I)
file_nakosais = ''  # open(r"reps/lv-dumpscan-nakos-"+curdate+".txt", "w", encoding='utf-8')
mas_nakosais = []
title_nakosais = 'Dalībnieks:Edgars2007/Aka/Typo/Nākošais'
# plain ISBN (outside templates and not an "isbn =" parameter)
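# The lookbehind skips "|isbn" and "{isbn" (template parameters); the
# lookahead skips "isbn =" assignments.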
check_isbn = re.compile(r"(?<![\|{])\s*isbn(?!\s*=)", flags=re.I)
file_isbn = ''  # open(r"reps/lv-dumpscan-isbn-"+curdate+".txt", "w", encoding='utf-8')
mas_isbn = []
title_isbn = 'Dalībnieks:Edgars2007/ISBN/Plain'
# "dubes" - doubled words (the same word twice in a row)
# FIXME: ignore biotaxon articles
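# \2 is a backreference to the captured word (3+ letters, Latvian
# diacritics included), so the pattern matches an exact repetition
# bounded by whitespace.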
check_dubes = re.compile(r"\s(([A-Za-zĀČĒĢĪĶĻŅŠŪŽāčēģīķļņšūž]{3,})\s+\2)\s", flags=re.I)
file_dubes = ''  # open(r"reps/lv-dumpscan-dubes-"+curdate+".txt", "w", encoding='utf-8')
mas_dubes = []
title_dubes = 'Dalībnieks:Edgars2007/Aka/Divi vienādi vārdi pēc kārtas'
# "sekojoš-" typo target
check_sekoj = re.compile(r"(sekojoš)", flags=re.I)
file_sekoj = ''  # open(r"reps/lv-dumpscan-sekoj-"+curdate+".txt", "w", encoding='utf-8')
mas_sekoj = []
title_sekoj = 'Dalībnieks:Edgars2007/Aka/Typo/Sekojošais'
# year links whose label is a different year than the target, e.g. [[1990|1991]]
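# (?!\2) ensures the 4-digit label differs from the year captured in the
# link target.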
check_years = re.compile(r"(\[\[.*?(\d{4}).*?\|(?!\2)\d{4}\]\])", flags=re.I)
file_years = ''  # open(r"reps/lv-dumpscan-years-"+curdate+".txt", "w", encoding='utf-8')
mas_years = []
title_years = 'Dalībnieks:Edgars2007/Aka/Gadu saites'  # no context needed
def getLastFile1():
    # Thin wrapper around checkDump.getLastFile for the 'dumpsOther' job.
    return getLastFile(jobname='dumpsOther')
def parse_findings(regex, pagetext, pagetitle, file, data):
    """Collect every match of regex in pagetext and append the result to
    data as [pagetitle, findings], each find shown with 30 characters of
    context on either side and the match itself highlighted."""
    context = 30
    finds = []
    for found in regex.finditer(pagetext):
        sectionname = ('<nowiki>' + pagetext[max(0, found.start() - context):found.start()]
                       + '</nowiki><span style="background-color:#fdd;padding:2px;margin:1px">\'\'\''
                       + pagetext[found.start():found.end()]
                       + '\'\'\'</span><nowiki>'
                       + pagetext[found.end():found.end() + context] + '</nowiki>')
        sectionname = sectionname.replace('\n', ' ')
        sectionname = re.sub(r'\s+', ' ', sectionname)
        finds.append(sectionname)
    if finds:
        # file.write('* [[' + pagetitle + ']]: ' + ', '.join(finds) + '\n')
        data.append([pagetitle, ', '.join(finds)])
#
def parseFile(fileToParse):
    """Run every enabled check against all non-redirect articles in the dump."""
    num = 0
    # XmlDump opens the dump file itself (including .bz2 archives), so no
    # separate BZ2File handle is needed.
    for page in xmlreader.XmlDump(fileToParse).parse():
        if page.ns == "0" and not page.isredirect:  # main namespace only
            num += 1
            if num % 2500 == 0:
                print(num)  # progress indicator
            pagetext = html.unescape(page.text)
            pagetitle = page.title
            # "nākoš-" typo
            parse_findings(check_nakosais, pagetext, pagetitle, file_nakosais, mas_nakosais)
            # plain ISBN
            parse_findings(check_isbn, pagetext, pagetitle, file_isbn, mas_isbn)
            # doubled words
            parse_findings(check_dubes, pagetext, pagetitle, file_dubes, mas_dubes)
            # "sekojoš-" typo
            parse_findings(check_sekoj, pagetext, pagetitle, file_sekoj, mas_sekoj)
            # year links (currently disabled)
            # parse_findings(check_years, pagetext, pagetitle, file_years, mas_years)
#
def putWiki():
    """Publish each non-empty findings list to its report page."""
    for mas, title in [(mas_nakosais, title_nakosais), (mas_isbn, title_isbn),
                       (mas_dubes, title_dubes), (mas_sekoj, title_sekoj),
                       (mas_years, title_years)]:
        if mas:
            thetext = ["* [[{}]]: {}".format(f[0], f[1]) for f in mas]
            page = pywikibot.Page(site, title)
            page.text = '\n'.join(thetext)
            # "Bots: atjaunināts" = "Bot: updated"
            page.save(summary="Bots: atjaunināts", botflag=False, minor=False)
#
def main():
    lastdata = getLastFile1()
    if not lastdata:
        return 0
    filelink = lastdata['path']
    dateStr = lastdata['date']
    parseFile(filelink)
    putWiki()
    # Record the processed dump date for the 'dumpsOther' job.
    setLastDump(str(dateStr), 'dumpsOther')
#
if __name__ == '__main__':
    main()