-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdeWikiFrau2.py
151 lines (125 loc) · 3.89 KB
/
deWikiFrau2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Tool Forge bot: exports German-Wikipedia biographies of women that lack a
# Latvian-language interwiki link (see SQL_main below) into a TSV dump.
import pywikibot, re, os, time, sys, json, operator
import toolforge
from datetime import date, datetime, timedelta, timezone
# NOTE(review): pytz's `timezone` shadows `datetime.timezone` imported on the
# previous line; only the pytz callable is used below — confirm before cleanup.
from pytz import timezone
# Read-only connection to the dewiki replica on the analytics cluster.
conn = toolforge.connect('dewiki_p','analytics')
utc_timezone = timezone("UTC")
lva_timezone = timezone("Europe/Riga")
def encode_if_necessary(b):
    """Decode *b* from UTF-8 when it is a bytes object; pass anything else through."""
    # Exact type check (not isinstance) preserved from the original code.
    if type(b) is not bytes:
        return b
    return b.decode('utf8')
def run_query(query):
    """Execute *query* on the replica connection and return all result rows.

    Parameters:
        query: a complete SQL statement (str).
    Returns:
        All rows, as produced by ``cursor.fetchall()``.

    A Ctrl-C while the query runs terminates the whole script (original
    behavior, preserved).
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query)
        rows = cursor.fetchall()
    except KeyboardInterrupt:
        sys.exit()
    finally:
        # Bug fix: the cursor was never closed, leaking a server-side
        # handle on every call.
        cursor.close()
    return rows
#
# Main replica query: for every mainspace page in category "Frau" that has NO
# lvwiki langlink, return its interwiki-link count (popularity proxy), title,
# Wikidata item id, and its "Geboren_*" (year-of-birth) categories joined
# with '|'; most-linked articles first, capped at 90000 rows.
SQL_main = """select count(l.ll_lang), p.page_title,
(select pp.pp_value from page_props pp where pp.pp_page=p.page_id and pp_propname='wikibase_item') as wd,
(SELECT GROUP_CONCAT(cl_to SEPARATOR '|') FROM categorylinks cl where cl.cl_from=p.page_id and cl_to like "Geboren_%") as cats
from langlinks l
join page p on p.page_id=l.ll_from
and p.page_namespace=0 and not exists (select * from langlinks m where m.ll_from=l.ll_from and m.ll_lang="lv")
and exists (select * from categorylinks cla where cla.cl_type="page" and l.ll_from=cla.cl_from
and cla.cl_to="Frau"
)
group by l.ll_from
order by count(l.ll_lang) desc
limit 90000;"""
def utc_to_local(utc_dt):
    """Interpret a naive datetime *utc_dt* as UTC and convert it to Europe/Riga."""
    aware = utc_timezone.localize(utc_dt)
    return aware.astimezone(lva_timezone)
# --- module-level helpers and state ---

# Parameterized INSERT template; not referenced in this file — TODO confirm
# external use before removing.
sql_insert = 'INSERT INTO `entries` (`name`, `group_name`, `jsondata`,`last_upd`) VALUES (%s, %s, %s, %s)'


def encode_all_items(row):
    """Return *row* as a list with every bytes field decoded to str."""
    return list(map(encode_if_necessary, row))


bigm = {}  # scratch mapping; unused in the visible code — TODO confirm
def sort_order(themas):
    """Order period labels newest-first, with unrecognized labels appended.

    Recognized label shapes and their numeric sort weights:
      'N. gs pme' -> -(N + 1) * 100   (centuries BCE; older = more negative)
      'N. gs'     -> (N - 1) * 100    (centuries CE)
      'NNNNs'     -> NNNN             (decades)
    Anything else is kept as a plain string and sorted alphabetically after
    the dated labels.

    Returns a list of (label, weight) pairs: dated labels in descending
    weight, then the plain labels in ascending string order.
    """
    weighted = {}
    leftovers = {}
    for label in themas:
        if '. gs pme' in label:
            weighted[label] = -100 * (int(label.replace('. gs pme', '')) + 1)
        elif '. gs' in label and label.replace('. gs', '').isdigit():
            weighted[label] = 100 * (int(label.replace('. gs', '')) - 1)
        elif label.replace('s', '').isdigit():
            weighted[label] = int(label.replace('s', ''))
        else:
            leftovers[label] = label
    dated = sorted(weighted.items(), key=lambda kv: kv[1])
    plain = sorted(leftovers.items(), key=lambda kv: kv[1])
    # Reversing the ascending sort (rather than reverse=True) reproduces the
    # original's exact tie ordering.
    return dated[::-1] + plain
#
def centuryFromYear(text, year):
    """Return (year % 100) + 1, or 0 when year % 100 == 0; 'DIDNOT' on bad input.

    *text* is unused (kept for interface compatibility). *year* may be
    anything ``int()`` accepts.

    NOTE(review): century arithmetic normally uses ``year // 100``; this
    function uses ``%``. Callers in parse_dob pass values already truncated
    via ``text[:-2]``, so the current behavior is load-bearing — confirm
    before "fixing" the operator.
    """
    try:
        rem = int(year) % 100
    except (TypeError, ValueError):
        # The original used a bare except; int() can only raise these here.
        return 'DIDNOT'
    if rem == 0:
        return rem
    return rem + 1
#
def parse_dob(text):
    """Map a dewiki 'Geboren_*' category suffix to a Latvian period label.

    Recognized inputs:
      plain year '1950'            -> '1950s' (decade) when year > 1800,
                                      otherwise '<century>. gs'
      'im_N._Jahrhundert'          -> 'N. gs'
      'im_N._Jahrhundert_v._Chr.'  -> 'N. gs pme' when N < 51, else 'pme'
      'N_v._Chr.'                  -> '<century>. gs pme' when < 51, else 'pme'

    Returns the label string, or False when nothing matched.
    """
    toret = ''
    # Fix: raw strings for all regex patterns — '\d' in a plain string is a
    # SyntaxWarning (and eventually an error) on modern Python.
    if re.match(r'^\d+$', text):
        intdob = int(text)
        if intdob > 1800:  # split the most recent decades apart (translated from Latvian)
            toret = text[:-1] + '0s'
        else:
            toret = str(centuryFromYear(text, text[:-2])) + '. gs'
    reg1 = re.search(r'^im_(\d+)\._Jahrhundert$', text)
    if reg1:
        toret = reg1.group(1) + '. gs'
    reg1 = re.search(r'^im_(\d+)\._Jahrhundert_v\._Chr\.$', text)
    if reg1:
        intdob = int(reg1.group(1))
        if intdob < 51:
            toret = reg1.group(1) + '. gs pme'
        else:
            toret = 'pme'
    reg1 = re.search(r'^(\d+)_v\._Chr\.$', text)
    if reg1:
        intdob = centuryFromYear(text, reg1.group(1))
        if intdob < 51:
            toret = str(intdob) + '. gs pme'
        else:
            toret = 'pme'
    if toret == '':
        return False
    return toret
#
def getData():
    """Run the women-without-lvwiki query and dump the rows to a TSV file.

    Side effects: writes 'quarry-27936-women-2.txt' in the working directory
    and logs progress via pywikibot.output.
    """
    pywikibot.output('\tde wiki women')
    started = time.time()
    rows = run_query(SQL_main)
    elapsed = time.time() - started
    if elapsed > 30:
        # Only worth logging when the replica is unusually slow.
        pywikibot.output('{}'.format(elapsed))
    lines = ['\t'.join(map(str, encode_all_items(row))) for row in rows]
    local_now = utc_to_local(datetime.utcnow())
    stamp = "{0:%Y%m%d%H%M%S}".format(local_now)
    pywikibot.output(stamp)
    with open('quarry-27936-women-2.txt', 'w', encoding='utf-8') as out_file:
        out_file.write('\n'.join(lines))
    pywikibot.output('done')
#
getData()