Merge pull request #14 from Senzing/issue-3.dockter.1
Shipped with SenzingAPI 2.2.1
docktermj authored Jun 7, 2021
2 parents a93f238 + f591140 commit ef8791c
Showing 2 changed files with 127 additions and 59 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
[markdownlint](https://dlaa.me/markdownlint/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.10.0] - 2020-09-30

### Added to 1.10.0

- Shipped with SenzingAPI 2.2.1

## [1.9.0] - 2020-09-03

### Added to 1.9.0
180 changes: 121 additions & 59 deletions G2Project.py
@@ -1,19 +1,18 @@
#! /usr/bin/env python3

#--python imports
import optparse
import sys
import os
import json
import csv
import glob
import fnmatch
import glob
import json
import optparse
import os
import sys
import textwrap
from operator import itemgetter

#--project classes
from G2Exception import G2UnsupportedFileTypeException
from G2Exception import G2InvalidFileTypeContentsException
from CompressedFile import openPossiblyCompressedFile, fileRowParser
from CompressedFile import fileRowParser, openPossiblyCompressedFile
from G2Exception import (G2InvalidFileTypeContentsException,
G2UnsupportedFileTypeException)
from G2S3 import G2S3

try: from dateutil.parser import parse as dateParser
@@ -143,6 +142,33 @@ def mapAttribute(self, attrName, attrValue):

return attrMapping


def listHasLowerCaseKeys(self,listCollection):
hasLowerCaseKeys = False
for item in listCollection:
if type(item) is dict:
hasLowerCaseKeys = hasLowerCaseKeys or self.dictHasLowerCaseKeys(item)
elif type(item) is list:
hasLowerCaseKeys = hasLowerCaseKeys or self.listHasLowerCaseKeys(item)
return hasLowerCaseKeys


def dictHasLowerCaseKeys(self,dictCollection):
hasLowerCaseKeys = False
for dictKey, dictValue in dictCollection.items():
for c in dictKey:
if c.islower():
hasLowerCaseKeys = True
if type(dictValue) is dict:
hasLowerCaseKeys = hasLowerCaseKeys or self.dictHasLowerCaseKeys(dictValue)
elif type(dictValue) is list:
hasLowerCaseKeys = hasLowerCaseKeys or self.listHasLowerCaseKeys(dictValue)
return hasLowerCaseKeys


def recordHasLowerCaseKeys(self,jsonRecord):
return self.dictHasLowerCaseKeys(jsonRecord)
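
For context, the new *HasLowerCaseKeys helpers walk a parsed JSON record recursively and report whether any key, at any nesting depth and including keys inside lists, contains a lowercase character. A minimal standalone sketch of the same idea, using a hypothetical has_lower_keys function rather than the class methods above:

def has_lower_keys(obj):
    # Recurse through dicts and lists; only dictionary keys are inspected
    if isinstance(obj, dict):
        return any(any(c.islower() for c in key) or has_lower_keys(value)
                   for key, value in obj.items())
    if isinstance(obj, list):
        return any(has_lower_keys(item) for item in obj)
    return False

record = {'DATA_SOURCE': 'TEST', 'addr_list': [{'ADDR_LINE1': '1 Main St'}]}
print(has_lower_keys(record))  # True: 'addr_list' contains lowercase characters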

#----------------------------------------
def mapJsonRecord(self, jsonDict):
'''checks for mapping errors'''
@@ -380,52 +406,79 @@ def featureToJson(self, featureList):
#----------------------------------------
def loadProjectUri(self, fileSpec):
''' creates a project dictionary from a file spec '''
parmDict = {}
if '/?' in fileSpec:
parmString = fileSpec.split('/?')[1]
fileSpec = fileSpec.split('/?')[0]
parmList = parmString.split(',')
for parm in parmList:
if '=' in parm:
parmType = parm.split('=')[0].strip().upper()
parmValue = parm.split('=')[1].strip().replace('"','').replace("'",'').upper()
parmDict[parmType] = parmValue

#--try to determine file_format
if 'FILE_FORMAT' not in parmDict:
if 'FILE_TYPE' in parmDict:
parmDict['FILE_FORMAT'] = parmDict['FILE_TYPE']
else:
dummy, fileExtension = os.path.splitext(fileSpec)
parmDict['FILE_FORMAT'] = fileExtension.replace('.','').upper()

if parmDict['FILE_FORMAT'] not in ('JSON', 'CSV', 'UMF', 'TAB', 'TSV', 'PIPE'):
print('ERROR: File format must be either JSON, CSV UMF, TAB, TSV or PIPE to use the file specification!')
self.success = False
else:
if G2S3.isS3Uri(fileSpec):
s3list = G2S3.ListOfS3UrisOfFilesInBucket(fileSpec, os.path.dirname(G2S3.getFilePathFromUri(fileSpec)))
fileList = fnmatch.filter(s3list, fileSpec)
for file in fileSpec:

parmDict = {}
parmString = ''
parmList = []
# Have additional parameters been specified?
if '/?' in file:
# Split what we are expecting and anything else discard
fileSpec, parmString, *_ = file.split('/?')
parmList = parmString.split(',')

for parm in parmList:
if '=' in parm:
parmType = parm.split('=')[0].strip().upper()
parmValue = parm.split('=')[1].strip().replace('"','').replace("'",'').upper()
parmDict[parmType] = parmValue
# If no additional parameters were specified, use file directly; this keeps file globbing simple,
# since fileSpec is a str when /? is present but a list when it isn't, while file is always a single entry
else:
if fileSpec.upper().startswith('FILE://'):
fileSpec = fileSpec[7:]
try: fileList = glob.glob(fileSpec)
except: fileList = []
if not fileList:
print('ERROR: file specification did not return any files!')
fileSpec = file

#--try to determine file_format
if 'FILE_FORMAT' not in parmDict:
if 'FILE_TYPE' in parmDict:
parmDict['FILE_FORMAT'] = parmDict['FILE_TYPE']
else:
_, fileExtension = os.path.splitext(fileSpec)
parmDict['FILE_FORMAT'] = fileExtension.replace('.','').upper()

if parmDict['FILE_FORMAT'] not in ('JSON', 'CSV', 'UMF', 'TAB', 'TSV', 'PIPE'):
print(textwrap.dedent(f'''\n
ERROR: File format must be one of JSON, CSV, UMF, TAB, TSV, PIPE or specify file_format with the -f argument.
- ./G2Loader.py -f my_file.csv/?data_source=EXAMPLE
- ./G2Loader.py -f my_file.txt/?data_source=EXAMPLE,file_format=CSV
- If using a wildcard such as -f files_to_load* all files must have the same extension or use file_format=<format>
- ./G2Loader.py -f my_file*.csv/?data_source=EXAMPLE
- ./G2Loader.py -f my_file*/?data_source=EXAMPLE,file_format=CSV
- File format detected: {parmDict['FILE_FORMAT'] if parmDict['FILE_FORMAT'] else 'None'}
'''))
self.success = False
else:
self.projectFileName = 'n/a'
self.projectFilePath = os.path.dirname(os.path.abspath(fileList[0]))
for fileName in fileList:
sourceDict = {}
sourceDict['FILE_NAME'] = fileName
sourceDict['FILE_FORMAT'] = parmDict['FILE_FORMAT']
if 'DATA_SOURCE' in parmDict:
sourceDict['DATA_SOURCE'] = parmDict['DATA_SOURCE']
self.projectSourceList.append(sourceDict)

self.prepareSourceFiles()
if G2S3.isS3Uri(fileSpec):
s3list = G2S3.ListOfS3UrisOfFilesInBucket(fileSpec, os.path.dirname(G2S3.getFilePathFromUri(fileSpec)))
fileList = fnmatch.filter(s3list, fileSpec)
else:
if fileSpec.upper().startswith('FILE://'):
fileSpec = fileSpec[7:]
try:
fileList = glob.glob(fileSpec)
except:
fileList = []

if not fileList:
print('ERROR: File specification did not return any files!')
self.success = False
else:
self.projectFileName = 'n/a'
self.projectFilePath = os.path.dirname(os.path.abspath(fileList[0]))
for fileName in fileList:
sourceDict = {}
sourceDict['FILE_NAME'] = fileName
sourceDict['FILE_FORMAT'] = parmDict['FILE_FORMAT']
if 'DATA_SOURCE' in parmDict:
sourceDict['DATA_SOURCE'] = parmDict['DATA_SOURCE']
self.projectSourceList.append(sourceDict)

if self.success:
self.prepareSourceFiles()

return
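
For reference, the reworked loadProjectUri accepts file specifications that carry optional parameters after '/?' as comma-separated key=value pairs (for example my_file*.csv/?data_source=EXAMPLE,file_format=CSV), and falls back to FILE_TYPE and then the file extension when no file_format is given. A rough standalone sketch of that decomposition, using a hypothetical parse_file_spec helper that is not part of G2Project.py:

import os

def parse_file_spec(spec):
    # Split off any '/?key=value,key=value' parameter string
    path, _, parm_string = spec.partition('/?')
    parms = {}
    for parm in parm_string.split(','):
        if '=' in parm:
            key, value = parm.split('=', 1)
            parms[key.strip().upper()] = value.strip().strip('\'"').upper()
    # Fall back to FILE_TYPE, then to the file extension, for the format
    if 'FILE_FORMAT' not in parms:
        parms['FILE_FORMAT'] = parms.get('FILE_TYPE', os.path.splitext(path)[1].lstrip('.').upper())
    return path, parms

print(parse_file_spec('my_file*.csv/?data_source=EXAMPLE'))
# -> ('my_file*.csv', {'DATA_SOURCE': 'EXAMPLE', 'FILE_FORMAT': 'CSV'})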

@@ -505,41 +558,48 @@ def loadJsonProject(self):

#----------------------------------------
def loadCsvProject(self):
fileData = {}
fileData['FILE_NAME'] = self.projectFileName
fileData['FILE_FORMAT'] = self.projectFileFormat

fileData = {
'FILE_NAME': self.projectFileName,
'FILE_FORMAT': self.projectFileFormat
}

if self.projectFileFormat == 'CSV':
fileData['DELIMITER'] = ','
elif self.projectFileFormat in ('TSV', 'TAB'):
fileData['DELIMITER'] = '\t'
elif self.projectFileFormat == 'PIPE':
fileData['DELIMITER'] = '|'

fileData['MULTICHAR_DELIMITER'] = False

csvFile = openPossiblyCompressedFile(self.projectFileName, 'r')
fileData['HEADER_ROW'] = [x.strip().upper() for x in fileRowParser(next(csvFile), fileData)]

if not(fileData['HEADER_ROW']):
print('ERROR: project file does not contain a header row!')
self.success = False
elif not 'FILE_NAME' in fileData['HEADER_ROW']:
print('ERROR: project file does not contain a column for FILE_NAME!')
self.success = False
else:

for line in csvFile:
rowData = fileRowParser(line, fileData)
if rowData: #--skip blank lines
self.projectSourceList.append(rowData)

csvFile.close()

return
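
As the loadCsvProject changes show, the project file format selects a single-character delimiter: CSV uses a comma, TSV/TAB use a tab, PIPE uses '|', and MULTICHAR_DELIMITER is fixed to False. A small table-driven equivalent of those branches, shown only as an assumed illustration:

DELIMITERS = {'CSV': ',', 'TSV': '\t', 'TAB': '\t', 'PIPE': '|'}

file_data = {
    'FILE_NAME': 'project.csv',      # hypothetical project file
    'FILE_FORMAT': 'CSV',
    'MULTICHAR_DELIMITER': False,
}
file_data['DELIMITER'] = DELIMITERS.get(file_data['FILE_FORMAT'])
print(file_data['DELIMITER'])  # ,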

#----------------------------------------
def prepareSourceFiles(self):
''' ensure project files referenced exist and are valid '''
print('')

print()
self.sourceList = []
sourceRow = 0

for sourceDict in self.projectSourceList:
sourceRow += 1

@@ -571,6 +631,7 @@ def prepareSourceFiles(self):
#--csv stuff
sourceDict['ENCODING'] = sourceDict['ENCODING'] if 'ENCODING' in sourceDict else 'utf-8-sig'
sourceDict['DELIMITER'] = sourceDict['DELIMITER'] if 'DELIMITER' in sourceDict else None

if not sourceDict['DELIMITER']:
if sourceDict['FILE_FORMAT'] == 'CSV':
sourceDict['DELIMITER'] = ','
@@ -693,7 +754,9 @@ def prepareSourceFiles(self):

fileReader.close()

self.sourceList.append(sourceDict)
if self.success:
self.sourceList.append(sourceDict)

return

#----------------------------------------
@@ -838,4 +901,3 @@ def compositeKeyBuilder(rowData, keyList):
del myProject

sys.exit()
