words_from_dump.py
"""
This script take Faroese Wikipedia dump and generates a list of unique words that it contains
"""
import sys
import logging
from mediawiki_dump.dumps import WikipediaDump
from mediawiki_dump.reader import DumpReaderArticles
from mediawiki_dump.tokenizer import clean, tokenize
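# Note on the imports (an assumption based on how they are used below):
# WikipediaDump fetches the dump of the given wiki, DumpReaderArticles
# iterates over its article pages, and clean()/tokenize() strip wiki markup
# and split the text into words.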
logging.basicConfig(level=logging.INFO)


def words_from_dump(wiki):
    """
    :type wiki: str
    """
    logger = logging.getLogger('words_from_dump')
    logger.info('Processing dump of "%s" wiki...', wiki)

    dump = WikipediaDump(wiki)
    pages = DumpReaderArticles().read(dump)

    long_words = []

    # pages = list(pages)[:50]  # debug: take only the first X pages

    for page in pages:
        content = page.content

        if str(content).startswith('#REDIRECT'):
            logger.debug('%s is a redirect, skipping...', page.title)
            continue

        # tokenize the title and the cleaned article text
        article_words = tokenize(clean(page.title + ' ' + content))

        # lower-case every token
        article_words = [str(word).lower() for word in article_words]

        # make the list unique and sort it
        article_words = sorted(set(article_words))

        # keep only long words (and filter out words containing "x")
        words_from_article = [word for word in article_words if len(word) > 10 and 'x' not in word]

        if 'filmsleikstjóririthøvundurframleiðarisjónleikari' in words_from_article:
            logger.info('Word found in %s', page.title)
            print(content)

        long_words += words_from_article

        # print('---')
        # print(title, content, article_words)

    # sort the long words by length, longest first
    long_words = sorted(set(long_words), key=len, reverse=True)

    # show the top X
    for i, word in enumerate(long_words[:50]):
        print('%d %s - %d' % (i + 1, word, len(word)))


if __name__ == "__main__":
    words_from_dump(wiki=sys.argv[1] if len(sys.argv) > 1 else 'fo')
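
# Usage sketch (assuming the `mediawiki_dump` package is installed and that a
# plain language code such as 'fo' selects the corresponding Wikipedia dump):
#
#   python words_from_dump.py fo
#
# With no argument the script falls back to 'fo' (Faroese). Each of the 50
# longest words is then printed as "<rank> <word> - <length>".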