Skip to content

Commit

Permalink
Crest Work (#87)
Browse files Browse the repository at this point in the history
* TDL-19742 Run discover mode if catalog is not given in sync mode (#85)

* TDL-19742 Run discover mode if catalog is not given in sync mode

* Add call count assertion and remove unused import

Co-authored-by: Jay Tilala <jay.tilala@CDSYS.LOCAL>

* TDL-14450: fix inconsistency of empty cell for boolean (#84)

* handled empty value scenario for the booleans

* Added unit test for null value in boolean

* Removed workaround from the integration test

Co-authored-by: harshpatel4crest <harsh.patel4@crestdatasys.com>

* TDL-14449: Syncs do not parse datetime values in number datatyped columns as strings (#83)

* updated the code to store datetime values as string in numberType column

* updated the test cases and handled boolean in numberType

* updated code for handling numberType values

* updated code to handle number format and int exponential values

* added comments for transforming number format data

* updated datatype integration test

* resolved comments

* resolved comments

* updated comments

* updated comments

* handled duplicate code in transform numberType values

* added comments

* added logger and removed redundant arg

* updated unittest and code

* updated comment

* updated bug id test case

* TDL-19029: Add support for data collection from the shared drives (#80)

* TDL-17517 - Add missing tap-tester cases (#65)

* Added missing test cases

* Updated standard bookmark test name

* Added back bookmark for full_table stream

* Updated comment

* Removed bookmark test case.

* TDL-17698: Dict based to class based refactoring (#66)

* refactored code to class based

* resolve unittest failure

* updated the code to write state after syncing records for file metadata stream

* added code change to return if file is not changed

* added code change to write file metadata bookmark at the end of the sync

* added function comments

* created a function to get path with query params

* updated code according to pylint

* resolved unittest failure

* TDL-19029: Add support for data collection from the shared drives

* change streams.py

* update README for supportsAllDrives

* remove invalid datatype exception

* include false as string or bool

* add unittest

* updated error message

* change default value

* update comments

* update parameter name snake case

* solved circleci error

* update comments in unittest

* Add exception for the invalid value of the supports_all_drives

* optimize code as per collin suggestion

* Addressed andy's comments

* rename import sync name

* rename import sync name

* updated unittest

Co-authored-by: Prijen Khokhani <88327452+prijendev@users.noreply.github.com>
Co-authored-by: Harsh <80324346+harshpatel4crest@users.noreply.github.com>
Co-authored-by: prijendev <prijen.khokhani@crestdatasys.com>
Co-authored-by: Jay Tilala <jay.tilala@CDSYS.LOCAL>
Co-authored-by: harshpatel4crest <harsh.patel4@crestdatasys.com>

* added param for shared drive for all syncs (#89)

Co-authored-by: jtilala <104966482+jtilala@users.noreply.github.com>
Co-authored-by: Jay Tilala <jay.tilala@CDSYS.LOCAL>
Co-authored-by: savan-chovatiya <80703490+savan-chovatiya@users.noreply.github.com>
Co-authored-by: Prijen Khokhani <88327452+prijendev@users.noreply.github.com>
Co-authored-by: prijendev <prijen.khokhani@crestdatasys.com>
  • Loading branch information
6 people authored Sep 1, 2022
1 parent db30219 commit d15ac18
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 35 deletions.
4 changes: 2 additions & 2 deletions tap_google_sheets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,10 @@ def main():

if parsed_args.discover:
do_discover(client, spreadsheet_id)
elif parsed_args.catalog:
else:
sync(client=client,
config=config,
catalog=parsed_args.catalog,
catalog=parsed_args.catalog or discover(client, spreadsheet_id),
state=state)

if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions tap_google_sheets/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,6 +145,7 @@ def __init__(self,
self.__expires = None
self.__session = requests.Session()
self.base_url = None

# if request_timeout is other than 0,"0" or "" then use request_timeout
if request_timeout and float(request_timeout):
request_timeout = float(request_timeout)
Expand Down
8 changes: 5 additions & 3 deletions tap_google_sheets/streams.py
Original file line number Diff line number Diff line change
Expand Up @@ -251,7 +251,8 @@ class FileMetadata(GoogleSheets):
replication_method = "INCREMENTAL"
replication_keys = ["modifiedTime"]
params = {
"fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser"
"fields": "id,name,createdTime,modifiedTime,version,teamDriveId,driveId,lastModifyingUser",
"supportsAllDrives": True
}

def sync(self, catalog, state, selected_streams):
Expand Down Expand Up @@ -426,8 +427,9 @@ def new_transform(self, data, typ, schema, path):
return False, None

elif typ == "boolean":
# return the data as string itself if the value is of type string
if isinstance(data, str) and data is not None:
if data is None: # returns "null" if data is none
return True, None
if isinstance(data, str): # return the data as string itself if the value is of type string
return True, data
try:
return True, bool(data)
Expand Down
69 changes: 56 additions & 13 deletions tap_google_sheets/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,37 +145,80 @@ def transform_sheet_boolean_data(value, unformatted_value, sheet_title, col_name
return col_val

# transform decimal values in the sheet
def transform_sheet_decimal_data(value, sheet_title, col_name, col_letter, row_num, col_type):
def transform_sheet_decimal_data(formatted_value, unformatted_value, sheet_title, col_name, col_letter, row_num, col_type):
"""
Transform number type data and return according to the datatype in the sheet
:param
formatted_value - The displayed value of a cell in the sheet ie. 2022-01-01
unformatted_value - The calculated value of the field as per the value type ie. 44562,
(the date values are converted into serial numbers by Google's API)
sheet_title - The title of the sheet
col_name - Column name
col_letter - Column letter of the record ie. A, B, C, etc.
row_num - Row number of the record
col_type - Column type of the record (here: numberType)
"""

# Removing comma to handle US number type format ie. 123,456.10 -> 123456.10
numeric_value = formatted_value.replace(",", "")
try:
# Verify we can convert formatted value to float for scientific formatted numbers
# For example:
# formatted value: "1.23E+03"
# unformatted value: 1234
# thus, "1.23E+03" can be converted with float(), but casting it with int() raises an error and the wrong value would be returned
float(numeric_value)
except ValueError:
LOGGER.info('Received the value for sheet: {}, column: {}, cell: {}{} with unexpected data type. Ingesting this value with string format in the target'.format(
sheet_title, col_name, col_letter, row_num
))
return str(formatted_value) # Return original value in case of ValueError

if type(unformatted_value) == int:
return unformatted_value

# For float type data, round off to 15 decimal digits
# Determine float decimal digits
decimal_digits = str(value)[::-1].find('.')
decimal_digits = str(unformatted_value)[::-1].find('.')
if decimal_digits > 15:
try:
# ROUND to multipleOf: 1e-15
col_val = float(round(value, 15))
col_val = float(round(unformatted_value, 15))
except ValueError:
col_val = str(value)
col_val = str(unformatted_value)
LOGGER.info('WARNING: POSSIBLE DATA TYPE ERROR; SHEET: {}, COL: {}, CELL: {}{}, TYPE: {}'.format(
sheet_title, col_name, col_letter, row_num, col_type))
return col_val
else: # decimal_digits <= 15, no rounding
try:
col_val = float(value)
col_val = float(unformatted_value)
except ValueError:
col_val = str(value)
col_val = str(unformatted_value)
LOGGER.info('WARNING: POSSIBLE DATA TYPE ERROR: SHEET: {}, COL: {}, CELL: {}{}, TYPE: {}'.format(
sheet_title, col_name, col_letter, row_num, col_type))
return col_val

# transform number values in the sheet
def transform_sheet_number_data(value, sheet_title, col_name, col_letter, row_num, col_type):
if type(value) == int:
return int(value)
elif type(value) == float:
return transform_sheet_decimal_data(value, sheet_title, col_name, col_letter, row_num, col_type)
def transform_sheet_number_data(formatted_value, unformatted_value, sheet_title, col_name, col_letter, row_num, col_type):
"""
Handled number type records by type-casting into a float to verify the user has entered number type data.
:param
formatted_value - The displayed value of a cell in the sheet
unformatted_value - The calculated value of the cell as per its value type (not the displayed text)
sheet_title - The title of the sheet
col_name - Column name
col_letter - Column letter of the record ie. A, B, C, etc.
row_num - Row number of the record
col_type - Column type of the record (here: numberType)
"""
if type(unformatted_value) in [int, float]:
return transform_sheet_decimal_data(formatted_value, unformatted_value, sheet_title, col_name, col_letter, row_num, col_type)
else:
LOGGER.info('WARNING: POSSIBLE DATA TYPE ERROR: SHEET: {}, COL: {}, CELL: {}{}, TYPE: {} '.format(
sheet_title, col_name, col_letter, row_num, col_type))
return str(value)
return str(unformatted_value)

# return transformed column the values based on the datatype
def get_column_value(value, unformatted_value, sheet_title, col_name, col_letter, row_num, col_type, row):
Expand All @@ -198,7 +241,7 @@ def get_column_value(value, unformatted_value, sheet_title, col_name, col_letter

# NUMBER (INTEGER AND FLOAT)
elif col_type == 'numberType':
return transform_sheet_number_data(unformatted_value, sheet_title, col_name, col_letter, row_num, col_type)
return transform_sheet_number_data(value, unformatted_value, sheet_title, col_name, col_letter, row_num, col_type)

# STRING
elif col_type == 'stringValue':
Expand Down
13 changes: 2 additions & 11 deletions tests/test_google_sheets_datatypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -264,33 +264,24 @@ def test_run(self):

if test_case is None or 'empty' in test_case: # some rows we expect empty values rather than strings

# BUG_TDL-14450 | https://jira.talendforge.org/browse/TDL-14450
# The boolean empty rows are getting parsed as False...but only when it's not the last column
if column == 'Boolean': # BUG_TDL-14450
continue # skip

# verify the expected rows are actually Null
self.assertIsNone(value)

# As "'0" returns false, which does not satisfy the test case below for the boolean column
elif value is not None or value != "":

# BUG_TDL-14449 | https://jira.talendforge.org/browse/TDL-14449
if test_case in {'date', 'time', 'datetime'} and column in {'Currency', 'Number'}: # BUG_TDL-14449
continue # skip

if column == 'Boolean' and value in (-1, 1, 0): # special integer values falls back to boolean
self.assertTrue(isinstance(value, bool), msg=f'test case: {test_case} value: {value}')
continue
# verify the non-standard value has fallen back to a string type
self.assertTrue(isinstance(value, str), msg=f'test case: {test_case} value: {value}')

# BUG_TDL-14431 [https://jira.talendforge.org/browse/TDL-14431]
# BUG_TDL-18932 [https://jira.talendforge.org/browse/TDL-18932]
# Date and Datetime do not fall back to string for boolean, time, or numbers

# verify dates, times and datetimes DO NOT COERCE VALUES to the standard format
if column in string_column_formats.keys():
if column in ["Date", "Datetime"] and sdc_row in [3, 4, 6, 7]: # BUG_TDL-14431
if column in ["Date", "Datetime"] and sdc_row in [3, 4, 6, 7]: # BUG_TDL-18932
continue # skip assertion

self.assertNotStringFormat(value, string_column_formats[column])
Expand Down
9 changes: 9 additions & 0 deletions tests/unittests/test_boolean_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,15 @@ def test_boolean_returned_for_boolean_columns(self):
transformed_data = new_transform(transformer, data, "boolean", schema, '')
self.assertEqual(transformed_data[1], True)

def test_null_returned_for_boolean_col(self):
'''
Verify that null value should not be replicated as False and it should be replicated as null.
'''
data = None
transformer = MockTransformer()
transformed_data = new_transform(transformer, data, "boolean", schema, '')
self.assertEqual(transformed_data[1], None)

def test_date_time_with_serial_number_1_in_boolean_col(self):
"""
Verify that datetime with serial number 1 returns string date instead of true.
Expand Down
34 changes: 34 additions & 0 deletions tests/unittests/test_catalog_in_sync.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import unittest
from unittest import mock
from tap_google_sheets import main


class MockedParseArgs:
    """Stand-in for the parsed command-line arguments.

    The class object itself (not an instance) is used as the return value of
    the patched ``singer.utils.parse_args`` call in this test module, so every
    value is read as a class attribute.
    """
    # Falsy so the code under test does not take the discover-mode branch.
    discover = False
    # Config with empty placeholder values for the credential keys.
    config = {"client_id":"", "client_secret": "", "refresh_token": "", "user_agent": ""}
    # Falsy state; the tests in this module expect sync to be called with state={}.
    state = False
    # Placeholder catalog value that the tests expect to be forwarded to sync.
    catalog = "test"

@mock.patch("tap_google_sheets.discover")
@mock.patch("tap_google_sheets.singer.utils.parse_args", return_value=MockedParseArgs)
@mock.patch("tap_google_sheets.GoogleClient.__enter__", return_value="test")
@mock.patch("tap_google_sheets.sync")
class TestCatalog(unittest.TestCase):
    """Verify which catalog ``main()`` hands to ``sync`` in sync mode.

    The class-level patches replace ``discover``, ``parse_args``, the Google
    client context manager and ``sync``. Mock arguments are injected
    bottom-up, so the decorator closest to the class (``sync``) is the first
    mock parameter of every test method.
    """

    def test_catalog_is_given_in_sync(self, mocked_sync, mocked_google_client, mocked_parse_args, mocked_discover):
        """
        To verify that if catalog is given in sync mode then run with catalog file
        """
        main()
        mocked_sync.assert_called_with(client="test", config=MockedParseArgs.config, catalog="test", state={})
        self.assertEqual(mocked_discover.call_count, 0, "discover function is not called expected times")

    def test_catalog_is_not_given_in_sync(self, mocked_sync, mocked_google_client, mocked_parse_args, mocked_discover):
        """
        To verify that if catalog is not given in sync mode then run discover mode to generate catalog
        """
        # Use a value different from MockedParseArgs.catalog's default so the
        # assertion proves the catalog really came from discover().
        mocked_discover.return_value = "discovered-catalog"
        # Override the catalog only for the duration of this test; assigning
        # MockedParseArgs.catalog directly would leak the empty value into
        # every test that runs afterwards and make the suite order-dependent.
        with mock.patch.object(MockedParseArgs, "catalog", ""):
            main()
        mocked_sync.assert_called_with(client="test", config=MockedParseArgs.config, catalog="discovered-catalog", state={})
        self.assertEqual(mocked_discover.call_count, 1, "discover function is not called expected times")
62 changes: 56 additions & 6 deletions tests/unittests/test_number_transform.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,75 @@
import unittest
from unittest import mock
from tap_google_sheets.transform import transform_sheet_number_data

class TestNumberTransform(unittest.TestCase):
"""Verify that boolean values falls back as string"""
def test_number_transform_boolean_as_string(self):

@mock.patch("tap_google_sheets.transform.LOGGER.info")
def test_number_transform_boolean_as_string(self, mocked_logger_info):
"""Verify that boolean values falls back as string"""
value = True
transformed_data = transform_sheet_number_data(value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
transformed_data = transform_sheet_number_data("TRUE", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, str)
self.assertEqual(transformed_data, "True")
# verify warning logger is called with expected params
mocked_logger_info.assert_called_with("WARNING: POSSIBLE DATA TYPE ERROR: SHEET: {}, COL: {}, CELL: {}{}, TYPE: {} ".format(
"test-sheet", "test-column", "col", 1, "numberType"))

def test_number_transform_int_value_as_int(self):
"""Verify that int values falls back as type int"""
value = 1
transformed_data = transform_sheet_number_data(value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
transformed_data = transform_sheet_number_data("1", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, int)
self.assertEqual(transformed_data, 1)

def test_number_transform_int_exponential_value_as_int(self):
"""Verify that exponential int values falls back as type int"""
value = 1234
transformed_data = transform_sheet_number_data("1.23E+03", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, int)
self.assertEqual(transformed_data, 1234)

def test_number_transform_int_US_format_value_as_int(self):
"""Verify that US format int values falls back as type int"""
value = 1234
transformed_data = transform_sheet_number_data("1,234", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, int)
self.assertEqual(transformed_data, 1234)

def test_number_transform_float_value_as_float(self):
"""Verify that float values falls back as type float"""
value = 1.0
transformed_data = transform_sheet_number_data(value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
value = 1.1
transformed_data = transform_sheet_number_data("1.1", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, float)
self.assertEqual(transformed_data, 1.1)

def test_number_transform_float_exponential_value_as_float(self):
"""Verify that exponential float values falls back as type float"""
value = 5e-16
transformed_data = transform_sheet_number_data("5.00E-16", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, float)
self.assertEqual(transformed_data, 1.0)
self.assertEqual(transformed_data, 5e-16)

def test_number_transform_float_US_format_value_as_float(self):
"""Verify that US format float values falls back as type float"""
value = 1234.1
transformed_data = transform_sheet_number_data("1,234.1", value, sheet_title='test-sheet', col_name='test-column', col_letter='col', row_num=1, col_type='numberType')
self.assertIsInstance(transformed_data, float)
self.assertEqual(transformed_data, 1234.1)

def test_number_transform_datetime_value_as_string(self):
"""Verify that datetime values falls back as type string"""

datetime_expected_value = transform_sheet_number_data("01/01/2022 0:00:00", 44562, "test_sheet", "Number Column", "A", 4, "numberType")

self.assertEqual(datetime_expected_value, "01/01/2022 0:00:00")
self.assertIsInstance(datetime_expected_value, str)

def test_number_transform_time_value_as_string(self):
"""Verify that time values falls back as type string"""

time_expected_value = transform_sheet_number_data("5:00 PM", 0.7083333333333334, "test_sheet", "Number Column", "A", 5, "numberType")

self.assertEqual(time_expected_value, "5:00 PM")
self.assertIsInstance(time_expected_value, str)

0 comments on commit d15ac18

Please sign in to comment.