Commit bafa129

Merge pull request #5687 from internetarchive/feature/handsfree-partner-batch-imports
Adds hands-free partner import runner
2 parents 18e9da0 + d0c75ee commit bafa129

File tree: 2 files changed, +215 -0 lines changed

scripts/partner_batch_imports.py

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
"""
Process partner bibliographic csv data into importable json book
records and then batch submit into the ImportBot
`import_item` table (http://openlibrary.org/admin/imports)
which queues items to be imported via the
Open Library JSON import API: https://openlibrary.org/api/import
"""

import os
import re
import sys
import web
import datetime
from datetime import timedelta
import logging
import requests

# Add openlibrary into our path so we can process config + batch functions
from openlibrary.core.imports import Batch
from infogami import config
from openlibrary.config import load_config

logger = logging.getLogger("openlibrary.importer.bwb")

SCHEMA_URL = "https://raw.githubusercontent.com/internetarchive" \
    "/openlibrary-client/master/olclient/schemata/import.schema.json"


class Biblio():

    ACTIVE_FIELDS = [
        'title', 'isbn_13', 'publish_date', 'publishers',
        'weight', 'authors', 'lc_classifications', 'pagination',
        'languages', 'subjects', 'source_records'
    ]
    INACTIVE_FIELDS = [
        "copyright", "issn", "doi", "lccn", "dewey", "length",
        "width", "height"
    ]
    REQUIRED_FIELDS = requests.get(SCHEMA_URL).json()['required']

    def __init__(self, data):
        self.isbn = data[124]
        self.source_id = 'bwb:%s' % self.isbn
        self.isbn_13 = [self.isbn]
        self.title = data[10]
        self.publish_date = data[20][:4]  # YYYY, YYYYMMDD
        self.publishers = [data[135]]
        self.weight = data[39]
        self.authors = self.contributors(data)
        self.lc_classifications = data[147]
        self.pagination = data[36]
        self.languages = [data[37].lower()]
        self.source_records = [self.source_id]
        self.subjects = [
            s.capitalize().replace('_', ', ')
            for s in data[91:100]
            # + data[101:120]
            # + data[153:158]
            if s
        ]

        # Inactive fields
        self.copyright = data[19]
        self.issn = data[54]
        self.doi = data[145]
        self.lccn = data[146]
        self.dewey = data[49]
        # physical_dimensions
        # e.g. "5.4 x 4.7 x 0.2 inches"
        self.length, self.width, self.height = data[40:43]

        # Assert importable
        assert self.isbn_13
        for field in self.REQUIRED_FIELDS:
            assert getattr(self, field)

    @staticmethod
    def contributors(data):
        def make_author(name, _, typ):
            author = {'name': name}
            if typ == 'X':
                # set corporate contributor
                author['entity_type'] = 'org'
            # TODO: sort out contributor types
            # AU = author
            # ED = editor
            return author

        contributors = (
            (data[21+i*3], data[22+i*3], data[23+i*3]) for i in range(5)
        )

        # form list of author dicts
        authors = [make_author(*c) for c in contributors if c[0]]
        return authors

    def json(self):
        return {
            field: getattr(self, field)
            for field in self.ACTIVE_FIELDS
            if getattr(self, field)
        }


def load_state(path, logfile):
    """Retrieves starting point from logfile, if log exists

    Takes as input a path which expands to an ordered candidate list
    of bettworldbks* filenames to process, the location of the
    logfile, and determines which of those files are remaining, as
    well as what our offset is in that file.

    e.g. if we request path containing f1, f2, f3 and our log
    says f2,100 then we start our processing at f2 at the 100th line.

    This assumes the script is being called w/ e.g.:
    /1/var/tmp/imports/2021-08/Bibliographic/*/
    """
    filenames = sorted([
        os.path.join(path, f)
        for f in os.listdir(path)
        if f.startswith("bettworldbks")
    ])
    try:
        with open(logfile) as fin:
            active_fname, offset = next(fin).strip().split(',')
            unfinished_filenames = filenames[filenames.index(active_fname):]
            return unfinished_filenames, int(offset)
    except (ValueError, OSError):
        return filenames, 0


def update_state(logfile, fname, line_num=0):
    """Records the last file we began processing and the current line"""
    with open(logfile, 'w') as fout:
        fout.write('%s,%s\n' % (fname, line_num))


def csv_to_ol_json_item(line):
    """converts a line to a book item"""
    b = Biblio(line.strip().split('|'))
    return {
        'ia_id': b.source_id,
        'data': b.json()
    }


def batch_import(path, batch, batch_size=5000):
    logfile = os.path.join(path, 'import.log')
    filenames, offset = load_state(path, logfile)

    for fname in filenames:
        book_items = []
        with open(fname, 'r') as f:
            logger.info("Processing: %s from line %s" % (fname, offset))
            for line_num, line in enumerate(f):

                # skip over already processed records
                if offset:
                    if offset > line_num:
                        continue
                    offset = 0

                try:
                    book_items.append(csv_to_ol_json_item(line))
                except UnicodeDecodeError:
                    pass

                # If we have enough items, submit a batch
                if not ((line_num + 1) % batch_size):
                    batch.add_items(book_items)
                    update_state(logfile, fname, line_num)
                    book_items = []  # clear added items

        # Add any remaining book_items to batch
        if book_items:
            batch.add_items(book_items)
            update_state(logfile, fname, line_num)


def main():
    load_config(
        os.path.abspath(os.path.join(
            os.sep, 'olsystem', 'etc', 'openlibrary.yml')))
    # Partner data is offset ~15 days from start of month
    date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', date.year, date.month)
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    batch_import(sys.argv[1], batch)


if __name__ == '__main__':
    main()
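
For illustration, a minimal sketch of the checkpoint/resume behaviour that the load_state docstring describes (not part of the commit; the temporary directory and filenames are invented, the import path is assumed, and importing the module requires openlibrary/infogami to be importable plus network access for the schema fetch):

import os
import tempfile

from partner_batch_imports import load_state, update_state  # import path assumed

path = tempfile.mkdtemp()
for name in ('bettworldbks001', 'bettworldbks002'):
    open(os.path.join(path, name), 'w').close()
logfile = os.path.join(path, 'import.log')

# No import.log yet: every bettworldbks* file is pending, starting at line 0
filenames, offset = load_state(path, logfile)
assert offset == 0 and filenames[0].endswith('bettworldbks001')

# Record that processing stopped in the second file at line 100 ...
update_state(logfile, os.path.join(path, 'bettworldbks002'), line_num=100)

# ... so the next run resumes there and skips the finished file
filenames, offset = load_state(path, logfile)
assert offset == 100 and filenames[0].endswith('bettworldbks002')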
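
Likewise, a small worked example of the batch naming in main(), using an invented run date: because partner data arrives roughly 15 days into the month, stepping back 15 days from the run date names the batch after the month the data covers; main() then hands sys.argv[1] (e.g. one of the /1/var/tmp/imports/2021-08/Bibliographic/*/ directories mentioned in the load_state docstring) to batch_import.

import datetime
from datetime import timedelta

date = datetime.date(2021, 9, 10) - timedelta(days=15)  # 2021-08-26
batch_name = "%s-%04d%02d" % ('bwb', date.year, date.month)
assert batch_name == 'bwb-202108'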
Lines changed: 21 additions & 0 deletions
@@ -0,0 +1,21 @@
from ..partner_batch_imports import Biblio

csv_row = "USA01961304|0962561851||9780962561856|AC|I|TC||B||Sutra on Upasaka Precepts|The||||||||2006|20060531|Heng-ching, Shih|TR||||||||||||||226|ENG||0.545|22.860|15.240|||||||P|||||||74474||||||27181|USD|30.00||||||||||||||||||||||||||||SUTRAS|BUDDHISM_SACRED BOOKS|||||||||REL007030|REL032000|||||||||HRES|HRG|||||||||RB,BIP,MIR,SYN|1961304|00|9780962561856|67499962||PRN|75422798|||||||BDK America||1||||||||10.1604/9780962561856|91-060120||20060531|||||REL007030||||||"  # noqa: E501


class TestBiblio:
    def test_sample_csv_row(self):
        b = Biblio(csv_row.strip().split('|'))
        data = {
            'title': 'Sutra on Upasaka Precepts',
            'isbn_13': ['9780962561856'],
            'publish_date': '2006',
            'publishers': ['BDK America'],
            'weight': '0.545',
            'authors': [{'name': 'Heng-ching, Shih'}],
            'pagination': '226',
            'languages': ['eng'],
            'subjects': ['Sutras', 'Buddhism, sacred books'],
            'source_records': ['bwb:9780962561856']
        }
        assert b.json() == data
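
A possible follow-up check (not in this commit) could reuse the same sample row to confirm the {'ia_id', 'data'} item shape that batch_import() submits through csv_to_ol_json_item():

from ..partner_batch_imports import csv_to_ol_json_item

def test_csv_to_ol_json_item():
    item = csv_to_ol_json_item(csv_row)
    assert item['ia_id'] == 'bwb:9780962561856'
    assert item['data']['title'] == 'Sutra on Upasaka Precepts'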
