Skip to content

Commit

Permalink
Merge pull request #11 from kbrbe/development
Browse files Browse the repository at this point in the history
New features for version 0.3.0
  • Loading branch information
SvenLieber authored Jun 28, 2023
2 parents 319bde9 + 795bb21 commit 1fa9f19
Showing 6 changed files with 72 additions and 18 deletions.
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -5,6 +5,17 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.3.0] - 2023-06-28

As of this version, the tool can be installed via `pip` and can be used as a library, not only via the command line.

### Added

- The possibility to install the tool via `pip` ([#7](https://github.com/kbrbe/enrich-authority-csv/issues/7))

### Changed

- The import of the lib module is no longer relative. To use the tool without installing it, one therefore has to set the `PYTHONPATH` environment variable, for example `export PYTHONPATH=/home/youruser/repo/enrich-authority-csv`

## [0.2.0] - 2023-06-23

@@ -31,3 +42,4 @@ Mainly because the script was generalized to handle more than just the ISNI SRU

[0.1.0]: https://github.com/kbrbe/enrich-authority-csv/releases/tag/v0.1.0
[0.2.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.1.0...v0.2.0
[0.3.0]: https://github.com/kbrbe/enrich-authority-csv/compare/v0.2.0...v0.3.0
25 changes: 23 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -11,7 +11,7 @@ ISNI is the [ISO 27729:2012](https://www.iso.org/standard/44292.html) standard n
Given a CSV file where each row is a contributor to creative works, this script uses a specified identifier in one of the columns to
fill data gaps in other specified columns based on data available via a specified SRU API.

## Usage
## Usage via the commandline

Create and activate a Python virtual environment
```bash
@@ -83,8 +83,29 @@ The script will first provide some statistics of how many rows could possibly be
by looping over the input file in a streaming fashion.
Afterwards the script starts requesting data, progress is shown in a progress bar.

## Usage as a library

### Example output
The tool can also be used as a library within another Python script or a Jupyter notebook.

```python
from enrich_authority_csv.enrich_authority_csv import main as enrich_authority_csv

enrich_authority_csv(
configFile='config-example.json',
inputFile='input-file.csv',
outputFile='output-file.csv',
apiName='BnF',
query='aut.isni all',
recordSchema='unimarcxchange',
dataFields={'nationalities': 'nationality'},
delimiter=',',
secondsBetweenAPIRequests=0,
identifierColumnName='isniIDs')

```


## Example output

```bash
In total, the file contains 299 lines from which 298 contain the identifier to lookup (99.67%)
Empty file.
30 changes: 15 additions & 15 deletions enrich_authority_csv/enrich_authority_csv.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import os
import csv
from dotenv import load_dotenv
from config_parser import ConfigParser
import lib
from enrich_authority_csv.config_parser import ConfigParser
import enrich_authority_csv.lib as lib
import time
from tqdm import tqdm
from argparse import ArgumentParser


# -----------------------------------------------------------------------------
def main():

def parseArguments():
parser = ArgumentParser(description='This script reads a CSV file and requests for each found lookup identifier (in the column specified with --column-name-lookup-identifier) the datafields specified with --data')
parser.add_argument('-i', '--input-file', action='store', required=True, help='A CSV file that contains records about contributors')
parser.add_argument('-o', '--output-file', action='store', required=True, help='The CSV file in which the enriched records are stored')
@@ -24,24 +23,22 @@ def main():
parser.add_argument('-d', '--delimiter', action='store', default=',', help='The delimiter of the input CSV')
args = parser.parse_args()

return args

config = ConfigParser(args.config)
apiName = args.api
query = args.query
recordSchema = args.record_schema
# -----------------------------------------------------------------------------
def main(configFile, inputFile, outputFile, apiName, query, recordSchema, dataFields, delimiter, secondsBetweenAPIRequests, identifierColumnName):


config = ConfigParser(configFile)

dataFields = dict(map(lambda s: s.split('='), args.data))

# check if the requested data can be fetched based on the given API config
lib.verifyTask(config, apiName, recordSchema, dataFields)


delimiter = args.delimiter
secondsBetweenAPIRequests = args.wait
identifierColumnName = args.column_name_lookup_identifier

with open(args.input_file, 'r') as inFile, \
open(args.output_file, 'w') as outFile:
with open(inputFile, 'r') as inFile, \
open(outputFile, 'w') as outFile:


# Count some stats and reset the file pointer afterwards
@@ -172,4 +169,7 @@ def main():
print()
print(f'{lookupIdentifierName}: No missing values that would have a lookup identifier. So there is nothing to enrich')

main()
if __name__ == '__main__':
    # Command-line entry point: parse the arguments and forward them to main().
    args = parseArguments()
    # Every --data entry is turned into one key/value pair of the dataFields
    # dictionary. Split on the first '=' only, so that a value which itself
    # contains '=' does not produce more than two parts and break dict().
    dataFields = dict(s.split('=', 1) for s in args.data)
    main(args.config, args.input_file, args.output_file, args.api, args.query, args.record_schema, dataFields, args.delimiter, args.wait, args.column_name_lookup_identifier)
2 changes: 1 addition & 1 deletion enrich_authority_csv/lib.py
Original file line number Diff line number Diff line change
@@ -198,7 +198,7 @@ def initializeCounters(countReader, identifiers, isniColumnName, nationalityColu
... {'kbrIDs':'','ntaIDs':'','isniIDs':'002;003'},
... {'kbrIDs':'123','ntaIDs':'456','isniIDs':'002;003'}]
>>> initializeCounters(rows, {'kbrIDs':'KBR', 'ntaIDs':'NTA'}, 'isniIDs')
{'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2}}
{'numberRows': 5, 'numberRowsHaveISNI': 3, 'numberISNIs': 5, 'numberRowsMissingAtLeastOneIdentifier': 4, 'numberRowsMissingAndPossibleToBeEnriched': 2, 'KBR': {'numberMissingIdentifierRows': 3, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 1, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}, 'NTA': {'numberMissingIdentifierRows': 4, 'numberISNIs': 5, 'numberRowsToBeEnrichedHaveISNI': 2, 'numberRowsThatCannotBeEnriched': 2, 'numberFoundISNIRows': 0, 'numberFoundISNIs': 0}}
"""

# initialize counters
21 changes: 21 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import os

from setuptools import find_packages, setup

# Utility function to read the README file.
# Used for the long_description. It's nice, because now 1) we have a top level
# README file and 2) it's easier to type in the README file than to put a raw
# string in below ...
def read(fname):
    """Return the text content of *fname*, resolved relative to this file.

    The path is joined onto the directory that contains this script, so the
    result does not depend on the current working directory.

    The file is opened in a ``with`` block so the handle is closed
    deterministically, and decoded as UTF-8 so the result does not depend on
    the locale of the machine running the installation.
    """
    path = os.path.join(os.path.dirname(__file__), fname)
    with open(path, encoding="utf-8") as handle:
        return handle.read()

# Distribution metadata for the package.
setup(
    name="enrich_authority_csv",
    version="0.3.0",
    author="Sven Lieber",
    author_email="Sven.Lieber@kbr.be",
    description=("A python script that uses SRU APIs to complete a CSV file with missing data based on an available identifier column that can be looked up in the SRU API"),
    license="AGPL-3.0",
    keywords="csv authority-control isni authority-files enriching",
    # Call find_packages by its imported name: the module name `setuptools`
    # itself is never bound in this file, so `setuptools.find_packages()`
    # raised a NameError.
    packages=find_packages(),
    long_description=read('README.md'),
    # The README is Markdown; declare it so that package indexes do not try
    # to render the long description as reStructuredText.
    long_description_content_type="text/markdown",
)

0 comments on commit 1fa9f19

Please sign in to comment.