
Commit

Merge pull request #85 from UDST/enhancement/remove-whitespace-from-relational-cols

Enhancement/remove whitespace from relational cols
smmaurer authored Mar 11, 2021
2 parents 605fe9f + de247d6 commit 44b3a6a
Showing 3 changed files with 172 additions and 22 deletions.
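
For context, the sketch below (illustrative only, not code from this commit) shows the kind of cleanup this changeset introduces: stray whitespace is stripped from column names and from the relational ID columns used to join GTFS files, so that merges between them line up.

import pandas as pd

# toy trips table with a padded column name and padded ID values
df = pd.DataFrame({'trip_id ': [' a1', 'a2 '], 'route_id': ['10-101', ' 111 ']})
df.rename(columns=lambda x: x.strip(), inplace=True)  # clean column names
for col in ['trip_id', 'route_id']:
    df[col] = df[col].str.strip()  # clean relational ID values
print(df['trip_id'].tolist())  # ['a1', 'a2']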
6 changes: 6 additions & 0 deletions urbanaccess/gtfs/network.py
@@ -488,6 +488,12 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df):
stop_times_df = stop_times_df[
stop_times_df['unique_trip_id'].isin(uniquetriplist)]

    # if no records match, do not proceed and raise an error
    if len(stop_times_df) == 0:
        raise ValueError('No matching trip_ids were found. Suggest checking '
                         'for differences between trip_id values in '
                         'stop_times and trips GTFS files.')

# count missing stop times
missing_stop_times_count = stop_times_df[
'departure_time_sec'].isnull().sum()
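To see why the guard above helps (a hypothetical illustration, not code from the library): unique_trip_id values that differ only by surrounding whitespace fail an isin() match, leaving the filtered stop_times empty; the new ValueError reports that condition directly instead of letting a later step fail obscurely.

import pandas as pd

stop_times = pd.DataFrame({'unique_trip_id': ['a1 ', ' a2']})
valid_trip_ids = ['a1', 'a2']

# whitespace-padded IDs do not match, so the selection comes back empty
print(len(stop_times[stop_times['unique_trip_id'].isin(valid_trip_ids)]))  # 0

# after stripping the padding, the same IDs match
cleaned = stop_times['unique_trip_id'].str.strip()
print(len(stop_times[cleaned.isin(valid_trip_ids)]))  # 2
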
111 changes: 89 additions & 22 deletions urbanaccess/gtfs/utils_format.py
@@ -14,7 +14,7 @@ def _read_gtfs_agency(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -29,8 +29,10 @@ def _read_gtfs_agency(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
df = _remove_whitespace(df=df, textfile=textfile, col_list=None)

return df


@@ -41,7 +43,7 @@ def _read_gtfs_stops(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -62,8 +64,10 @@ def _read_gtfs_stops(textfile_path, textfile):
df['stop_lat'] = pd.to_numeric(df['stop_lat'])
df['stop_lon'] = pd.to_numeric(df['stop_lon'])

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['stop_id'])

return df


@@ -74,7 +78,7 @@ def _read_gtfs_routes(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -90,8 +94,11 @@ def _read_gtfs_routes(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['route_id'])

return df


@@ -102,7 +109,7 @@ def _read_gtfs_trips(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -118,13 +125,18 @@ def _read_gtfs_trips(textfile_path, textfile):
'service_id': object,
'route_id': object,
7: object}, low_memory=False)
# 7 is placeholder for shape id
# which may not exist in some txt files
# 7 is placeholder for shape id which may not exist in some txt files
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(
df=df,
textfile=textfile,
col_list=['trip_id', 'service_id', 'route_id'])

return df


@@ -135,7 +147,7 @@ def _read_gtfs_stop_times(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -154,8 +166,12 @@ def _read_gtfs_stop_times(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(
df=df, textfile=textfile, col_list=['trip_id', 'stop_id'])

return df


@@ -166,7 +182,7 @@ def _read_gtfs_calendar(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -189,8 +205,11 @@ def _read_gtfs_calendar(textfile_path, textfile):
'saturday', 'sunday']
for col in columnlist:
df[col] = pd.to_numeric(df[col])
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['service_id'])

return df


@@ -201,7 +220,7 @@ def _read_gtfs_calendar_dates(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -220,8 +239,10 @@ def _read_gtfs_calendar_dates(textfile_path, textfile):
log(warning_msg.format(os.path.join(
textfile_path, textfile)), level=lg.WARNING)

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['service_id'])

return df


@@ -1090,3 +1111,49 @@ def _generate_unique_feed_id(feed_folder):
'and')

return folder_snake_case_no_amps.lower()


def _remove_whitespace(df, textfile, col_list=None):
"""
    Remove leading and trailing whitespace from column names and,
    optionally, from values in the specified DataFrame columns.
Parameters
----------
df : pandas.DataFrame
DataFrame to process
textfile : str
name of text file
    col_list : list, optional
        if specified, list of column names as strings whose values will be
        checked for whitespace
Returns
-------
df : pandas.DataFrame
"""

# remove leading and trailing spaces in column names
before_cols = sorted(list(df.columns))
df.rename(columns=lambda x: x.strip(), inplace=True)
after_cols = sorted(list(df.columns))
if before_cols != after_cols:
cols_with_spaces = list(set(before_cols) - set(after_cols))
        log('GTFS file: {} column(s): {} had leading and/or trailing '
            'whitespace in column names. Spaces have been removed.'.format(
                textfile, cols_with_spaces))

# remove leading and trailing spaces in values for columns in list
if col_list:
df_copy = df.copy()
for col in col_list:
before_count = df_copy[col].str.len().sum()
df_copy[col] = df_copy[col].str.rstrip().str.lstrip()
after_count = df_copy[col].str.len().sum()
# only perform whitespace strip on columns that need it
if before_count != after_count:
df[col] = df[col].str.rstrip().str.lstrip()
                log('GTFS file: {} column: {} had leading and/or trailing '
                    'whitespace in its values. Spaces have been '
                    'removed.'.format(textfile, col))
return df
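
A short usage sketch of the helper above (illustrative values; assumes UrbanAccess is importable): column names are always stripped, while values are stripped only for the columns named in col_list.

import pandas as pd
from urbanaccess.gtfs import utils_format

trips = pd.DataFrame({'trip_id ': ['a1 ', ' a2'], 'service_id': ['wk-1', 'wk-2 ']})
trips = utils_format._remove_whitespace(
    df=trips, textfile='trips.txt', col_list=['trip_id', 'service_id'])
print(list(trips.columns))        # ['trip_id', 'service_id']
print(trips['trip_id'].tolist())  # ['a1', 'a2']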
77 changes: 77 additions & 0 deletions urbanaccess/tests/test_gtfs_utils_format.py
@@ -21,6 +21,50 @@ def folder_feed_4():
return r'/data/gtfs_feeds/city'


@pytest.fixture()
def trips_feed_w_invalid_values(tmpdir):
    # create df with ints instead of str, col names with spaces, and
    # relational column values with leading and trailing spaces
data = {
'route_id': ['10-101', '10-101', '10-101', '10-101',
111, '00111', '12-101', '12-101',
'13-101', '13-101'],
'trip_id': ['a1 ', ' a2', ' a3 ', 'a 4',
'b1', 'b2', 'c1', 'c2', 'd1', 'd2'],
'service_id ': ['weekday -1', 'weekday-1 ', 'weekday-1',
'weekday-1', 'weekday-2', 'weekday-2',
'weekday-3', 'weekday-3', 'weekend-1', 'weekend-1'],
' direction_id ': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
'wheelchair_ accessible': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
'bikes_allowed': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
}
index = range(10)
raw_df = pd.DataFrame(data, index)

feed_path = os.path.join(tmpdir.strpath, 'test_trips_invalid_values')
os.makedirs(feed_path)
print('writing test data to dir: {}'.format(feed_path))
feed_file_name = '{}.txt'.format('trips')
raw_df.to_csv(os.path.join(feed_path, feed_file_name), index=False)

data = {
'route_id': ['10-101', '10-101', '10-101', '10-101',
'111', '00111', '12-101', '12-101', '13-101', '13-101'],
'trip_id': ['a1', 'a2', 'a3', 'a 4',
'b1', 'b2', 'c1', 'c2', 'd1', 'd2'],
'service_id': ['weekday -1', 'weekday-1', 'weekday-1',
'weekday-1', 'weekday-2', 'weekday-2',
'weekday-3', 'weekday-3', 'weekend-1', 'weekend-1'],
'direction_id': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
'wheelchair_ accessible': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
'bikes_allowed': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
}
index = range(10)
expected_df = pd.DataFrame(data, index)

return raw_df, expected_df, feed_path


def test_calendar_dates_agencyid_feed_1(calendar_dates_feed_1,
routes_feed_1,
trips_feed_1,
@@ -866,3 +910,36 @@ def test_add_unique_gtfsfeed_id(stops_feed_1, routes_feed_1, trips_feed_1,
# identical to the cols in input df
original_cols = df_dict[df][1].columns
assert df_dict[df][1].equals(df_dict[df][0][original_cols])


def test_remove_whitespace_from_values(trips_feed_w_invalid_values):
raw_df, expected_df, feed_path = trips_feed_w_invalid_values

    # convert the one int record to str to match the dtype that the
    # read_gtfs function would produce
raw_df['route_id'] = raw_df['route_id'].astype('str')

# test when col_list is used
result = utils_format._remove_whitespace(
df=raw_df,
textfile='trips.txt',
col_list=['trip_id', 'service_id', 'route_id'])
assert result.equals(expected_df)

# test when no col_list is used
result_no_col_list = utils_format._remove_whitespace(
df=raw_df,
textfile='trips.txt',
col_list=None)
# spaces in cols should be removed
assert list(result_no_col_list.columns) == list(expected_df.columns)
# spaces in values should remain
assert result_no_col_list['trip_id'].str.len().sum() == raw_df[
'trip_id'].str.len().sum()


def test_read_gtfs_trips_w_invalid_values(trips_feed_w_invalid_values):
raw_df, expected_df, feed_path = trips_feed_w_invalid_values
result = utils_format._read_gtfs_trips(
textfile_path=feed_path, textfile='trips.txt')
assert result.equals(expected_df)
