diff --git a/CHANGELOG.md b/CHANGELOG.md index 5fbcf09..159af5b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). +## [0.3.0] - 2023-06-28 + +As of this version, the tool can be installed via `pip` and can be used as a library and not only via the commandline. + +### Added + +- The possibility to install the tool via `pip` ([#7](https://github.com/kbrbe/enrich-authority-csv/issues/7)) + +### Changed + +- The import of the lib module is no longer relative; to use the tool without installing it, one has to set the `PYTHONPATH` environment variable, for example `export PYTHONPATH=/home/youruser/repo/enrich-authority-csv` ## [0.2.0] - 2023-06-23 @@ -31,3 +42,4 @@ Mainly because the script was generalized to handle more than just the ISNI SRU [0.1.0]: https://github.com/kbrbe/enrich-authority-csv/releases/tag/v0.1.0 [0.2.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.1.0...v0.2.0 +[0.3.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.2.0...v0.3.0 diff --git a/README.md b/README.md index d0ce432..60a9795 100644 --- a/README.md +++ b/README.md @@ -11,7 +11,7 @@ ISNI is the [ISO 27729:2012](https://www.iso.org/standard/44292.html) standard n Given a CSV file where each row is a contributor to creative works, this script uses a specified identifier in one of the columns to fill data gaps in other specified columns based on data available via a specified SRU API. -## Usage +## Usage via the commandline Create and activate a Python virtual environment ```bash @@ -83,8 +83,29 @@ The script will first provide some statistics of how many rows could possibly be by looping over the input file in a streaming fashion. Afterwards the script starts requesting data, progress is shown in a progress bar. 
+## Usage as a library -### Example output +The tool can also be used as a library within another Python script or a Jupyter notebook. + +```python +from enrich_authority_csv.enrich_authority_csv import main as enrich_authority_csv + +enrich_authority_csv( + configFile='config-example.json', + inputFile='input-file.csv', + outputFile='output-file.csv', + apiName='BnF', + query='aut.isni all', + recordSchema='unimarcxchange', + dataFields={'nationalities': 'nationality'}, + delimiter=',', + secondsBetweenAPIRequests=0, + identifierColumnName='isniIDs') + +``` + + +## Example output ```bash In total, the file contains 299 lines from which 298 contain the identifier to lookup (99.67%) diff --git a/enrich_authority_csv/__init__.py b/enrich_authority_csv/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/enrich_authority_csv/enrich_authority_csv.py b/enrich_authority_csv/enrich_authority_csv.py index 3963075..029fd56 100644 --- a/enrich_authority_csv/enrich_authority_csv.py +++ b/enrich_authority_csv/enrich_authority_csv.py @@ -1,16 +1,15 @@ import os import csv from dotenv import load_dotenv -from config_parser import ConfigParser -import lib +from enrich_authority_csv.config_parser import ConfigParser +import enrich_authority_csv.lib as lib import time from tqdm import tqdm from argparse import ArgumentParser # ----------------------------------------------------------------------------- -def main(): - +def parseArguments(): parser = ArgumentParser(description='This script reads a CSV file and requests for each found lookup identifier (in the column specified with --column-name-lookup-identifier) the datafields specified with --data') parser.add_argument('-i', '--input-file', action='store', required=True, help='A CSV file that contains records about contributors') parser.add_argument('-o', '--output-file', action='store', required=True, help='The CSV file in which the enriched records are stored') @@ -24,24 +23,22 @@ def main(): 
parser.add_argument('-d', '--delimiter', action='store', default=',', help='The delimiter of the input CSV') args = parser.parse_args() + return args - config = ConfigParser(args.config) - apiName = args.api - query = args.query - recordSchema = args.record_schema +# ----------------------------------------------------------------------------- +def main(configFile, inputFile, outputFile, apiName, query, recordSchema, dataFields, delimiter, secondsBetweenAPIRequests, identifierColumnName): + + + config = ConfigParser(configFile) - dataFields = dict(map(lambda s: s.split('='), args.data)) # check if the requested data can be fetched based on the given API config lib.verifyTask(config, apiName, recordSchema, dataFields) - delimiter = args.delimiter - secondsBetweenAPIRequests = args.wait - identifierColumnName = args.column_name_lookup_identifier - with open(args.input_file, 'r') as inFile, \ - open(args.output_file, 'w') as outFile: + with open(inputFile, 'r') as inFile, \ + open(outputFile, 'w') as outFile: # Count some stats and reset the file pointer afterwards @@ -172,4 +169,7 @@ def main(): print() print(f'{lookupIdentifierName}: No missing values that would have a lookup identifier. So there is nothing to enrich') -main() +if __name__ == '__main__': + args = parseArguments() + dataFields = dict(map(lambda s: s.split('='), args.data)) + main(args.config, args.input_file, args.output_file, args.api, args.query, args.record_schema, dataFields, args.delimiter, args.wait, args.column_name_lookup_identifier) diff --git a/enrich_authority_csv/lib.py b/enrich_authority_csv/lib.py index b470b91..0efd966 100644 --- a/enrich_authority_csv/lib.py +++ b/enrich_authority_csv/lib.py @@ -198,7 +198,7 @@ def initializeCounters(countReader, identifiers, isniColumnName, nationalityColu ... {'kbrIDs':'','ntaIDs':'','isniIDs':'002;003'}, ... 
{'kbrIDs':'123','ntaIDs':'456','isniIDs':'002;003'}] >>> initializeCounters(rows, {'kbrIDs':'KBR', 'ntaIDs':'NTA'}, 'isniIDs') - {'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2}} + {'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'numberRowsMissingAndPossibleToBeEnriched': 2, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}} """ # initialize counters diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f4ccfff --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +import os +from setuptools import setup + +# Utility function to read the README file. +# Used for the long_description. It's nice, because now 1) we have a top level +# README file and 2) it's easier to type in the README file than to put a raw +# string in below ... 
+def read(fname):
+    return open(os.path.join(os.path.dirname(__file__), fname), encoding="utf-8").read()  # UTF-8 so the read does not depend on the platform locale
+
+setup(
+    name = "enrich_authority_csv",
+    version = "0.3.0",
+    author = "Sven Lieber",
+    author_email = "Sven.Lieber@kbr.be",
+    description = ("A python script that uses SRU APIs to complete a CSV file with missing data based on an available identifier column that can be looked up in the SRU API"),
+    license = "AGPL-3.0",
+    keywords = "csv authority-control isni authority-files enriching",
+    packages=["enrich_authority_csv"],  # listed explicitly: only `setup` is imported above, so `setuptools.find_packages()` raised NameError
+    long_description=read('README.md')  # the top-level README doubles as the long description
+)