Merge pull request #14 from Senzing/issue-3.dockter.1
Shipped with SenzingAPI 2.2.1
docktermj authored Jun 7, 2021
2 parents a93f238 + f591140 commit ef8791c
Showing 2 changed files with 127 additions and 59 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.md
@@ -6,6 +6,12 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
[markdownlint](https://dlaa.me/markdownlint/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [1.10.0] - 2020-09-30

### Added to 1.10.0

- Shipped with SenzingAPI 2.2.1

## [1.9.0] - 2020-09-03

### Added to 1.9.0
180 changes: 121 additions & 59 deletions G2Project.py
@@ -1,19 +1,18 @@
#! /usr/bin/env python3

#--python imports
import optparse
import sys
import os
import json
import csv
import glob
import fnmatch
import glob
import json
import optparse
import os
import sys
import textwrap
from operator import itemgetter

#--project classes
from G2Exception import G2UnsupportedFileTypeException
from G2Exception import G2InvalidFileTypeContentsException
from CompressedFile import openPossiblyCompressedFile, fileRowParser
from CompressedFile import fileRowParser, openPossiblyCompressedFile
from G2Exception import (G2InvalidFileTypeContentsException,
G2UnsupportedFileTypeException)
from G2S3 import G2S3

try: from dateutil.parser import parse as dateParser
@@ -143,6 +142,33 @@ def mapAttribute(self, attrName, attrValue):

return attrMapping


def listHasLowerCaseKeys(self,listCollection):
hasLowerCaseKeys = False
for item in listCollection:
if type(item) is dict:
hasLowerCaseKeys = hasLowerCaseKeys or self.dictHasLowerCaseKeys(item)
elif type(item) is list:
hasLowerCaseKeys = hasLowerCaseKeys or self.listHasLowerCaseKeys(item)
return hasLowerCaseKeys


def dictHasLowerCaseKeys(self,dictCollection):
hasLowerCaseKeys = False
for dictKey, dictValue in dictCollection.items():
for c in dictKey:
if c.islower():
hasLowerCaseKeys = True
if type(dictValue) is dict:
hasLowerCaseKeys = hasLowerCaseKeys or self.dictHasLowerCaseKeys(dictValue)
elif type(dictValue) is list:
hasLowerCaseKeys = hasLowerCaseKeys or self.listHasLowerCaseKeys(dictValue)
return hasLowerCaseKeys


def recordHasLowerCaseKeys(self,jsonRecord):
return self.dictHasLowerCaseKeys(jsonRecord)
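
For context, the new *HasLowerCaseKeys helpers walk a parsed JSON record recursively and report whether any key, at any nesting depth and including keys inside lists, contains a lowercase character. A minimal standalone sketch of the same idea, using a hypothetical has_lower_keys function rather than the class methods above:

def has_lower_keys(obj):
    # Recurse through dicts and lists; only dictionary keys are inspected
    if isinstance(obj, dict):
        return any(any(c.islower() for c in key) or has_lower_keys(value)
                   for key, value in obj.items())
    if isinstance(obj, list):
        return any(has_lower_keys(item) for item in obj)
    return False

record = {'DATA_SOURCE': 'TEST', 'addr_list': [{'ADDR_LINE1': '1 Main St'}]}
print(has_lower_keys(record))  # True: 'addr_list' contains lowercase characters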

#----------------------------------------
def mapJsonRecord(self, jsonDict):
'''checks for mapping errors'''
@@ -380,52 +406,79 @@ def featureToJson(self, featureList):
#----------------------------------------
def loadProjectUri(self, fileSpec):
''' creates a project dictionary from a file spec '''
parmDict = {}
if '/?' in fileSpec:
parmString = fileSpec.split('/?')[1]
fileSpec = fileSpec.split('/?')[0]
parmList = parmString.split(',')
for parm in parmList:
if '=' in parm:
parmType = parm.split('=')[0].strip().upper()
parmValue = parm.split('=')[1].strip().replace('"','').replace("'",'').upper()
parmDict[parmType] = parmValue

#--try to determine file_format
if 'FILE_FORMAT' not in parmDict:
if 'FILE_TYPE' in parmDict:
parmDict['FILE_FORMAT'] = parmDict['FILE_TYPE']
else:
dummy, fileExtension = os.path.splitext(fileSpec)
parmDict['FILE_FORMAT'] = fileExtension.replace('.','').upper()

if parmDict['FILE_FORMAT'] not in ('JSON', 'CSV', 'UMF', 'TAB', 'TSV', 'PIPE'):
print('ERROR: File format must be either JSON, CSV UMF, TAB, TSV or PIPE to use the file specification!')
self.success = False
else:
if G2S3.isS3Uri(fileSpec):
s3list = G2S3.ListOfS3UrisOfFilesInBucket(fileSpec, os.path.dirname(G2S3.getFilePathFromUri(fileSpec)))
fileList = fnmatch.filter(s3list, fileSpec)
for file in fileSpec:

parmDict = {}
parmString = ''
parmList = []
# Have additional parameters been specified?
if '/?' in file:
# Split what we are expecting and anything else discard
fileSpec, parmString, *_ = file.split('/?')
parmList = parmString.split(',')

for parm in parmList:
if '=' in parm:
parmType = parm.split('=')[0].strip().upper()
parmValue = parm.split('=')[1].strip().replace('"','').replace("'",'').upper()
parmDict[parmType] = parmValue
# If no additional parameters were specified, use file directly; this keeps file globbing simple,
# since fileSpec is a str when /? is present but a list when it isn't, while file is always a single entry
else:
if fileSpec.upper().startswith('FILE://'):
fileSpec = fileSpec[7:]
try: fileList = glob.glob(fileSpec)
except: fileList = []
if not fileList:
print('ERROR: file specification did not return any files!')
fileSpec = file

#--try to determine file_format
if 'FILE_FORMAT' not in parmDict:
if 'FILE_TYPE' in parmDict:
parmDict['FILE_FORMAT'] = parmDict['FILE_TYPE']
else:
_, fileExtension = os.path.splitext(fileSpec)
parmDict['FILE_FORMAT'] = fileExtension.replace('.','').upper()

if parmDict['FILE_FORMAT'] not in ('JSON', 'CSV', 'UMF', 'TAB', 'TSV', 'PIPE'):
print(textwrap.dedent(f'''\n
ERROR: File format must be one of JSON, CSV, UMF, TAB, TSV, PIPE or specify file_format with the -f argument.
- ./G2Loader.py -f my_file.csv/?data_source=EXAMPLE
- ./G2Loader.py -f my_file.txt/?data_source=EXAMPLE,file_format=CSV
- If using a wildcard such as -f files_to_load* all files must have the same extension or use file_format=<format>
- ./G2Loader.py -f my_file*.csv/?data_source=EXAMPLE
- ./G2Loader.py -f my_file*/?data_source=EXAMPLE,file_format=CSV
- File format detected: {parmDict['FILE_FORMAT'] if parmDict['FILE_FORMAT'] else 'None'}
'''))
self.success = False
else:
self.projectFileName = 'n/a'
self.projectFilePath = os.path.dirname(os.path.abspath(fileList[0]))
for fileName in fileList:
sourceDict = {}
sourceDict['FILE_NAME'] = fileName
sourceDict['FILE_FORMAT'] = parmDict['FILE_FORMAT']
if 'DATA_SOURCE' in parmDict:
sourceDict['DATA_SOURCE'] = parmDict['DATA_SOURCE']
self.projectSourceList.append(sourceDict)

self.prepareSourceFiles()
if G2S3.isS3Uri(fileSpec):
s3list = G2S3.ListOfS3UrisOfFilesInBucket(fileSpec, os.path.dirname(G2S3.getFilePathFromUri(fileSpec)))
fileList = fnmatch.filter(s3list, fileSpec)
else:
if fileSpec.upper().startswith('FILE://'):
fileSpec = fileSpec[7:]
try:
fileList = glob.glob(fileSpec)
except:
fileList = []

if not fileList:
print('ERROR: File specification did not return any files!')
self.success = False
else:
self.projectFileName = 'n/a'
self.projectFilePath = os.path.dirname(os.path.abspath(fileList[0]))
for fileName in fileList:
sourceDict = {}
sourceDict['FILE_NAME'] = fileName
sourceDict['FILE_FORMAT'] = parmDict['FILE_FORMAT']
if 'DATA_SOURCE' in parmDict:
sourceDict['DATA_SOURCE'] = parmDict['DATA_SOURCE']
self.projectSourceList.append(sourceDict)

if self.success:
self.prepareSourceFiles()

return
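
For reference, the reworked loadProjectUri accepts file specifications that carry optional parameters after '/?' as comma-separated key=value pairs (for example my_file*.csv/?data_source=EXAMPLE,file_format=CSV), and falls back to FILE_TYPE and then the file extension when no file_format is given. A rough standalone sketch of that decomposition, using a hypothetical parse_file_spec helper that is not part of G2Project.py:

import os

def parse_file_spec(spec):
    # Split off any '/?key=value,key=value' parameter string
    path, _, parm_string = spec.partition('/?')
    parms = {}
    for parm in parm_string.split(','):
        if '=' in parm:
            key, value = parm.split('=', 1)
            parms[key.strip().upper()] = value.strip().strip('\'"').upper()
    # Fall back to FILE_TYPE, then to the file extension, for the format
    if 'FILE_FORMAT' not in parms:
        parms['FILE_FORMAT'] = parms.get('FILE_TYPE', os.path.splitext(path)[1].lstrip('.').upper())
    return path, parms

print(parse_file_spec('my_file*.csv/?data_source=EXAMPLE'))
# -> ('my_file*.csv', {'DATA_SOURCE': 'EXAMPLE', 'FILE_FORMAT': 'CSV'})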

@@ -505,41 +558,48 @@ def loadJsonProject(self):

#----------------------------------------
def loadCsvProject(self):
fileData = {}
fileData['FILE_NAME'] = self.projectFileName
fileData['FILE_FORMAT'] = self.projectFileFormat

fileData = {
'FILE_NAME': self.projectFileName,
'FILE_FORMAT': self.projectFileFormat
}

if self.projectFileFormat == 'CSV':
fileData['DELIMITER'] = ','
elif self.projectFileFormat in ('TSV', 'TAB'):
fileData['DELIMITER'] = '\t'
elif self.projectFileFormat == 'PIPE':
fileData['DELIMITER'] = '|'

fileData['MULTICHAR_DELIMITER'] = False

csvFile = openPossiblyCompressedFile(self.projectFileName, 'r')
fileData['HEADER_ROW'] = [x.strip().upper() for x in fileRowParser(next(csvFile), fileData)]

if not(fileData['HEADER_ROW']):
print('ERROR: project file does not contain a header row!')
self.success = False
elif not 'FILE_NAME' in fileData['HEADER_ROW']:
print('ERROR: project file does not contain a column for FILE_NAME!')
self.success = False
else:

for line in csvFile:
rowData = fileRowParser(line, fileData)
if rowData: #--skip blank lines
self.projectSourceList.append(rowData)

csvFile.close()

return
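
As the loadCsvProject changes show, the project file format selects a single-character delimiter: CSV uses a comma, TSV/TAB use a tab, PIPE uses '|', and MULTICHAR_DELIMITER is fixed to False. A small table-driven equivalent of those branches, shown only as an assumed illustration:

DELIMITERS = {'CSV': ',', 'TSV': '\t', 'TAB': '\t', 'PIPE': '|'}

file_data = {
    'FILE_NAME': 'project.csv',      # hypothetical project file
    'FILE_FORMAT': 'CSV',
    'MULTICHAR_DELIMITER': False,
}
file_data['DELIMITER'] = DELIMITERS.get(file_data['FILE_FORMAT'])
print(file_data['DELIMITER'])  # ,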

#----------------------------------------
def prepareSourceFiles(self):
''' ensure project files referenced exist and are valid '''
print('')

print()
self.sourceList = []
sourceRow = 0

for sourceDict in self.projectSourceList:
sourceRow += 1

@@ -571,6 +631,7 @@ def prepareSourceFiles(self):
#--csv stuff
sourceDict['ENCODING'] = sourceDict['ENCODING'] if 'ENCODING' in sourceDict else 'utf-8-sig'
sourceDict['DELIMITER'] = sourceDict['DELIMITER'] if 'DELIMITER' in sourceDict else None

if not sourceDict['DELIMITER']:
if sourceDict['FILE_FORMAT'] == 'CSV':
sourceDict['DELIMITER'] = ','
@@ -693,7 +754,9 @@ def prepareSourceFiles(self):

fileReader.close()

self.sourceList.append(sourceDict)
if self.success:
self.sourceList.append(sourceDict)

return

#----------------------------------------
@@ -838,4 +901,3 @@ def compositeKeyBuilder(rowData, keyList):
del myProject

sys.exit()
