"""
Process partner bibliographic csv data into importable json book
records and then batch submit into the ImportBot
`import_item` table (http://openlibrary.org/admin/imports)
which queues items to be imported via the
Open Library JSON import API: https://openlibrary.org/api/import
"""


import os
import sys
import datetime
from datetime import timedelta
import logging
import requests

# openlibrary must be importable (on sys.path) so we can load config + batch functions
from openlibrary.core.imports import Batch
from infogami import config
from openlibrary.config import load_config

logger = logging.getLogger("openlibrary.importer.bwb")

SCHEMA_URL = (
    "https://raw.githubusercontent.com/internetarchive"
    "/openlibrary-client/master/olclient/schemata/import.schema.json"
)


class Biblio:
    """Maps one row of partner csv data (a list of pipe-separated field
    values at fixed column positions) onto Open Library import fields."""

    ACTIVE_FIELDS = [
        'title', 'isbn_13', 'publish_date', 'publishers',
        'weight', 'authors', 'lc_classifications', 'pagination',
        'languages', 'subjects', 'source_records'
    ]
    INACTIVE_FIELDS = [
        "copyright", "issn", "doi", "lccn", "dewey", "length",
        "width", "height"
    ]
    # Fetched once, at import time, from the olclient import schema; these
    # fields are asserted to be present in __init__.
    REQUIRED_FIELDS = requests.get(SCHEMA_URL).json()['required']

    def __init__(self, data):
        self.isbn = data[124]
        self.source_id = 'bwb:%s' % self.isbn
        self.isbn_13 = [self.isbn]
        self.title = data[10]
        self.publish_date = data[20][:4]  # YYYY, YYYYMMDD
        self.publishers = [data[135]]
        self.weight = data[39]
        self.authors = self.contributors(data)
        self.lc_classifications = data[147]
        self.pagination = data[36]
        self.languages = [data[37].lower()]
        self.source_records = [self.source_id]
        self.subjects = [
            s.capitalize().replace('_', ', ')
            for s in data[91:100]
            # + data[101:120]
            # + data[153:158]
            if s
        ]

        # Inactive fields
        self.copyright = data[19]
        self.issn = data[54]
        self.doi = data[145]
        self.lccn = data[146]
        self.dewey = data[49]
        # physical_dimensions
        # e.g. "5.4 x 4.7 x 0.2 inches"
        self.length, self.width, self.height = data[40:43]

        # Assert importable
        assert self.isbn_13
        for field in self.REQUIRED_FIELDS:
            assert getattr(self, field), field

    @staticmethod
    def contributors(data):
        def make_author(name, _, typ):
            author = {'name': name}
            if typ == 'X':
                # set corporate contributor
                author['entity_type'] = 'org'
            # TODO: sort out contributor types
            # AU = author
            # ED = editor
            return author

        # up to five contributor (name, _, type) triples at fixed column offsets
        contributors = (
            (data[21+i*3], data[22+i*3], data[23+i*3]) for i in range(5)
        )

        # form list of author dicts
        authors = [make_author(*c) for c in contributors if c[0]]
        return authors

    def json(self):
        return {
            field: getattr(self, field)
            for field in self.ACTIVE_FIELDS
            if getattr(self, field)
        }


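# Progress is checkpointed in a one-line logfile of the form "<filename>,<line>",
# e.g. (hypothetical): /1/var/tmp/imports/2021-08/Bibliographic/bettworldbks01,5000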
def load_state(path, logfile):
    """Retrieves starting point from logfile, if log exists

    Takes as input a path which expands to an ordered candidate list
    of bettworldbks* filenames to process, the location of the
    logfile, and determines which of those files remain to be
    processed, as well as what our line offset is in that file.

    e.g. if we request a path containing f1, f2, f3 and our log
    says f2,100 then we start our processing at f2 at the 100th line.

    This assumes the script is being called w/ e.g.:
    /1/var/tmp/imports/2021-08/Bibliographic/*/
    """
    filenames = sorted([
        os.path.join(path, f)
        for f in os.listdir(path)
        if f.startswith("bettworldbks")
    ])
    try:
        with open(logfile) as fin:
            active_fname, offset = next(fin).strip().split(',')
            unfinished_filenames = filenames[filenames.index(active_fname):]
            return unfinished_filenames, int(offset)
    except (ValueError, OSError):
        return filenames, 0


def update_state(logfile, fname, line_num=0):
    """Records the last file we began processing and the current line"""
    with open(logfile, 'w') as fout:
        fout.write('%s,%s\n' % (fname, line_num))


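# Each queued item pairs the partner source id with the record for the import
# API, e.g. (hypothetical values):
#   {'ia_id': 'bwb:9780123456789', 'data': {'title': ..., 'isbn_13': ['9780123456789'], ...}}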
def csv_to_ol_json_item(line):
    """Converts a line of the partner csv data into a JSON book item."""
    b = Biblio(line.strip().split('|'))
    return {
        'ia_id': b.source_id,
        'data': b.json()
    }


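# Streams each partner file line by line, converting rows to import items and
# submitting them to the batch in chunks of batch_size, checkpointing progress
# in the logfile after every chunk.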
def batch_import(path, batch, batch_size=5000):
    logfile = os.path.join(path, 'import.log')
    filenames, offset = load_state(path, logfile)

    for fname in filenames:
        book_items = []
        with open(fname, 'r') as f:
            logger.info("Processing: %s from line %s", fname, offset)
            for line_num, line in enumerate(f):

                # skip over already processed records
                if offset:
                    if offset > line_num:
                        continue
                    offset = 0

                try:
                    book_items.append(csv_to_ol_json_item(line))
                except UnicodeDecodeError:
                    # skip records we cannot decode
                    pass

                # If we have enough items, submit a batch
                if not ((line_num + 1) % batch_size):
                    batch.add_items(book_items)
                    update_state(logfile, fname, line_num)
                    book_items = []  # clear added items

        # Add any remaining book_items to batch
        if book_items:
            batch.add_items(book_items)
            update_state(logfile, fname, line_num)


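# Batches are named bwb-YYYYMM. The ~15 day offset below means a run early in a
# month is still attributed to the previous month's partner data.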
def main():
    load_config(
        os.path.abspath(os.path.join(
            os.sep, 'olsystem', 'etc', 'openlibrary.yml')))
    # Partner data is offset ~15 days from start of month
    date = datetime.date.today() - timedelta(days=15)
    batch_name = "%s-%04d%02d" % ('bwb', date.year, date.month)
    batch = Batch.find(batch_name) or Batch.new(batch_name)
    batch_import(sys.argv[1], batch)


if __name__ == '__main__':
    main()