-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathindexer
96 lines (77 loc) · 2.78 KB
/
indexer
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env python
import xml.etree.ElementTree as etree
import requests
import json
import os
# Root directory of the ebook library; the main script walks it looking for
# metadata.opf files.
BASEDIR='/media/data/ebooks'
# Prefix -> namespace URI map handed to ElementTree find() so that queries on
# metadata.opf can use the short 'dc:' and 'opf:' prefixes.
namespaces = {'dc': 'http://purl.org/dc/elements/1.1/',
'opf': 'http://www.idpf.org/2007/opf'}
def get_mobi_file(files):
    """Pick the first ``.mobi`` entry out of a list of file names.

    ``files`` is an iterable of file names. The first name ending in
    ``.mobi`` (case-sensitive) is returned; an empty string is returned
    when no name matches.
    """
    mobi_names = (name for name in files if name.endswith('.mobi'))
    return next(mobi_names, '')
def get_metadata(basedir):
    """Generate one record per book directory found below ``basedir``.

    A directory counts as a book when it contains ``metadata.opf``. For each
    such directory a tuple ``(opf_path, path, cover, mobi)`` is yielded:

    * ``opf_path`` -- the directory followed by ``/metadata.opf``
    * ``path`` -- the last two ``/``-separated components of the directory
    * ``cover`` -- ``'cover.jpg'`` when that file is present, else ``''``
    * ``mobi`` -- whatever ``get_mobi_file`` returns for the directory's files
    """
    for current_dir, _subdirs, filenames in os.walk(basedir):
        if 'metadata.opf' not in filenames:
            continue
        # Keep only the two trailing directory names (presumably
        # "<author>/<title>" in a calibre library -- confirm).
        relative = '/'.join(current_dir.split('/')[-2:])
        cover = 'cover.jpg' if 'cover.jpg' in filenames else ''
        mobi = get_mobi_file(filenames)
        yield ('%s/metadata.opf' % current_dir, relative, cover, mobi)
def parse_metadata(metadata):
    """Parse a ``metadata.opf`` file into a dict of index fields.

    Args:
        metadata: File name or file object of the OPF document; it is passed
            straight to ``xml.etree.ElementTree.parse``.

    Returns:
        A dict with the keys ``id`` (int), ``author``, ``language``,
        ``title`` and ``description``. The string fields are ``''`` when the
        element is missing or has no text. ``None`` is returned when the
        document has no ``dc:identifier`` with ``id="calibre_id"`` or when
        that identifier is empty or not an integer.

    Raises:
        xml.etree.ElementTree.ParseError: if the document is not well-formed.
    """
    root = etree.parse(metadata).getroot()

    def get_field(matcher):
        """Return the text of the first matching ``dc:`` element, or ''."""
        match = root.find('./opf:metadata/dc:%s' % matcher,
                          namespaces=namespaces)
        if match is None:
            return ''
        # An element without content (e.g. <dc:description/>) has text None;
        # normalise it so every field is always a string.
        return match.text or ''

    try:
        book_id = int(get_field('identifier[@id="calibre_id"]'))
    except ValueError:
        # Missing, empty or non-numeric identifier: report "no metadata"
        # to the caller instead of aborting the whole run.
        return None

    return {
        'id': book_id,
        'author': get_field('creator'),
        'language': get_field('language'),
        'title': get_field('title'),
        'description': get_field('description'),
    }
def update_entry(ebook_data):
    """Send one ebook document to Solr as a JSON ``add`` command.

    ``ebook_data`` is wrapped as ``{'add': {'doc': ebook_data}}``, serialised
    with ``json.dumps`` and POSTed to the update URL. The HTTP response is
    not inspected.
    """
    # NOTE(review): reload_core() addresses leierkasten.local while this
    # function addresses localhost -- confirm both are the same Solr instance.
    body = json.dumps({'add': {'doc': ebook_data}})
    requests.post(
        'http://localhost:8983/ebooks/update',
        data=body,
        headers={'Content-type': 'application/json',
                 'Accept': 'application/json'},
    )
def reload_core():
    """Ask the Solr core admin endpoint to reload the ``ebooks`` core.

    Issues a GET request carrying ``wt=json``, ``action=RELOAD`` and
    ``core=ebooks`` as query parameters. The response is not inspected.
    """
    requests.get(
        'http://leierkasten.local:8983/admin/cores',
        params={'wt': 'json', 'action': 'RELOAD', 'core': 'ebooks'},
    )
if __name__ == '__main__':
    # Index every book found below BASEDIR, then tell Solr to reload the core.
    added = 0
    for metadata_file, path, cover, mobi in get_metadata(BASEDIR):
        ebook_data = parse_metadata(metadata_file)
        if not ebook_data:
            print("Unable to find metadata in %s." % metadata_file)
            continue
        ebook_data.update({
            'path': path,
            'cover': cover,
            'mobi': mobi,
        })
        update_entry(ebook_data)
        # Count only documents actually sent. Keying the progress message to
        # the walk index would count skipped books as added, be off by one
        # (zero-based), and drop the message whenever the book at a multiple
        # of 100 is skipped.
        added += 1
        if added % 100 == 0:
            print("Added %s entries" % added)
    print('Finally reloading solr')
    reload_core()
    print('Done')
# vim: set tabstop=4 shiftwidth=4 expandtab: