-
Notifications
You must be signed in to change notification settings - Fork 8
/
database.py
executable file
·189 lines (166 loc) · 6.65 KB
/
database.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
#!/usr/bin/python
# -*- coding: utf-8 -*-
import re, json, sys, time
import oursql, os
from collections import defaultdict
from urllib import urlopen
# Messages collected while the module runs (unknown prefixes, empty
# categories, per-contest summaries); the script entry point writes them to
# update.log when the list is not empty.
updateLog = []
class DB:
    """
    Helper for running queries against the commonswiki_p database.
    """

    # Number of times a failed query is retried before the error is raised.
    maxRetries = 10

    def __init__(self):
        self.connect()

    def connect(self):
        """
        Open the connection and create a cursor on it.

        Connection settings not given here are read from ~/replica.my.cnf.
        """
        self.conn = oursql.connect(db='commonswiki_p', host='commonswiki.labsdb',
            read_default_file=os.path.expanduser('~/replica.my.cnf'),
            read_timeout=10, charset='utf8', use_unicode=True, autoping=True)
        self.cursor = self.conn.cursor()

    def query(self, *sql):
        """
        Execute a query, reconnecting and retrying up to maxRetries times.

        The arguments are passed unchanged to cursor.execute(). After each
        failure the method waits one second longer than the previous time,
        opens a fresh connection and tries again. When the retries are used
        up the last error is re-raised to the caller.
        """
        retries = 0
        while True:
            try:
                self.cursor.execute(*sql)
                return
            except (AttributeError, oursql.OperationalError):
                if retries >= self.maxRetries:
                    raise
                retries += 1
                print('Erro no DB, esperando %ds antes de tentar de novo' % retries)
                time.sleep(retries)
                try:
                    # Retrying on the same cursor cannot succeed once the
                    # connection has been dropped, so open a new one.
                    self.connect()
                except oursql.OperationalError:
                    # Still unreachable: keep the old cursor, the next
                    # attempt fails again and is counted as a retry.
                    pass

    def get(self):
        """
        Return all rows of the last executed query.
        """
        return self.cursor.fetchall()
def reData(txt, year):
"""
Parser para linha da configuração
"""
m = re.search(ur'''
\s*wl\["(?P<event>earth|monuments)"\]\[(?P<year>20\d\d)]\ ?=\ ?\{|
\s*\["(?P<country>[-a-z]+)"\]\ =\ \{\["start"\]\ =\ (?P<start>%s\d{10}),\ \["end"\]\ =\ (?P<end>%s\d{10})\}
''' % (year, year), txt, re.X)
return m and m.groupdict()
def getConfig(page):
"""
Lê a configuração da página de configuração no Commons
"""
api = urlopen('https://commons.wikimedia.org/w/api.php?action=query&format=json&prop=revisions&titles=%s&rvprop=content' % page)
text = json.loads(api.read())['query']['pages'].values()[0]['revisions'][0]['*']
data, event, prefixes = {}, None, {}
lines = iter(text.split(u'\n'))
for l in lines:
m = re.search(u'\s*\["(?P<prefix>[\w-]+)"\] = "(?P<name>[\w ]+)"|(?P<close>\})', l)
if prefixes and m and m.group('close'):
break
elif m and m.group('prefix'):
prefixes[m.group('prefix')] = m.group('name')
for l in lines:
g = reData(l, event[-4:] if event else ur'20\d\d')
if not g:
continue
if g['event']:
event = g['event'] + g['year']
data[event] = {}
elif g['country'] and event:
if g['country'] not in prefixes:
updateLog.append(u'Unknown prefix: ' + g['country'])
continue
data[event][prefixes[g['country']]] = {'start': int(g['start']), 'end': int(g['end'])}
return {name: config for name, config in data.items() if config}
# Countries whose category name is not simply the country name with spaces
# replaced by underscores; getData looks the country up here first and uses
# the mapped value as the last part of the category title.
catExceptions = {
u'Netherlands': u'the_Netherlands',
u'Czech Republic': u'the_Czech_Republic',
u'Philippines': u'the_Philippines',
u'United Kingdom': u'the_United_Kingdom',
u'United States': u'the_United_States'
}
# Query run by getData once per category; the single "?" placeholder is the
# category title. For every file-type member of the category it returns four
# columns: img_timestamp, whether img_name appears as gil_to in
# globalimagelinks, user_name and user_registration.
dbquery = u'''SELECT
img_timestamp,
img_name IN (SELECT DISTINCT gil_to FROM globalimagelinks),
user_name,
user_registration
FROM (SELECT
cl_to,
cl_from
FROM categorylinks
WHERE cl_to = ? AND cl_type = 'file') cats
INNER JOIN page ON cl_from = page_id
INNER JOIN image ON page_title = img_name
INNER JOIN user ON img_user = user_id'''
def getData(name, data):
    """
    Collect and process the database data for every country of one contest.

    name is the event key (event name followed by a four-digit year, e.g.
    'monuments2010'); data maps each country name to its configuration
    dict, which may hold 'start' and 'end' timestamps.

    data is modified in place and returned. Countries whose category has
    no files are removed (and reported in updateLog); every other country
    gains the keys 'data' (files per day, keyed by the first eight digits
    of the timestamp), 'users' (per-user 'count', 'usage' and 'reg'),
    'usercount', 'count', 'usage', 'userreg' and 'category'.
    """
    category = u'Images_from_Wiki_Loves_%s_%s_in_' % \
        (name[0:-4].capitalize(), name[-4:])
    # Contest-wide bounds, used for countries that do not define their own.
    starttime = min(conf['start'] for conf in data.values() if 'start' in conf)
    endtime = max(conf['end'] for conf in data.values() if 'end' in conf)
    # Iterate over a snapshot of the keys: countries may be deleted below.
    for country in list(data):
        if country[0].islower():
            updateLog.append(u'')
        if name == 'monuments2010':
            # Every country of monuments2010 is looked up in this category.
            cat = u'Images_from_Wiki_Loves_Monuments_2010'
        else:
            cat = category + catExceptions.get(country, country.replace(' ', u'_'))
        commonsdb.query(dbquery, (cat,))
        rows = tuple(
            (int(timestamp),
             bool(usage),
             user.decode('utf-8'),
             int(user_reg or 0))
            for timestamp, usage, user, user_reg in commonsdb.get())
        if not rows:
            updateLog.append(u'%s in %s is configurated, but no file was found in [[Category:%s]]' %
                (name, country, cat.replace(u'_', u' ')))
            del data[country]
            continue
        entry = data[country]
        first = entry.get('start', starttime)
        last = entry.get('end', endtime)
        perDay = defaultdict(int)  # {timestamp_day0: n, timestamp_day1: n, ...}
        users = {}  # {'user1': {'count': n, 'usage': n, 'reg': timestamp}, ...}
        for timestamp, usage, user, user_reg in rows:
            # Ignore files uploaded outside the campaign period.
            if timestamp < first or timestamp > last:
                continue
            # Count files per day.
            perDay[str(timestamp)[0:8]] += 1
            stats = users.setdefault(user, {'count': 0, 'usage': 0, 'reg': user_reg})
            stats['count'] += 1
            if usage:
                stats['usage'] += 1
        entry['data'] = perDay
        entry['users'] = users
        entry['usercount'] = len(users)
        entry['count'] = sum(stats['count'] for stats in users.values())
        entry['usage'] = sum(stats['usage'] for stats in users.values())
        # Users whose registration timestamp is later than the campaign start.
        entry['userreg'] = sum(1 for stats in users.values() if stats['reg'] > first)
        entry['category'] = cat
    return data
if __name__ == '__main__' and 'update' in sys.argv:
config = getConfig(u'Module:WL_data')
try:
with open('db.json', 'r') as f:
db = json.load(f)
except Exception as e:
print u'Erro ao abrir db.json:', repr(e)
db = {}
commonsdb = DB()
for WL in config:
start = time.time()
db[WL] = getData(WL, config[WL])
with open('db.json', 'w') as f:
json.dump(db, f)
log = 'Saved %s: %dsec, %d countries, %d uploads' % \
(WL, time.time() - start, len(db[WL]), sum(db[WL][c].get('count', 0) for c in db[WL]))
print log
updateLog.append(log)
commonsdb.conn.close()
if updateLog:
with open('update.log', 'w') as f:
f.write(time.strftime('%Y%m%d%H%M%S') + '\n' + '\n'.join(updateLog))