
Commit

Merge pull request #85 from UDST/enhancement/remove-whitespace-from-relational-cols

Enhancement/remove whitespace from relational cols
smmaurer authored Mar 11, 2021
2 parents 605fe9f + de247d6 commit 44b3a6a
Showing 3 changed files with 172 additions and 22 deletions.
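
For context, the sketch below (illustrative only, not code from this commit) shows the kind of cleanup this changeset introduces: stray whitespace is stripped from column names and from the relational ID columns used to join GTFS files, so that merges between them line up.

import pandas as pd

# toy trips table with a padded column name and padded ID values
df = pd.DataFrame({'trip_id ': [' a1', 'a2 '], 'route_id': ['10-101', ' 111 ']})
df.rename(columns=lambda x: x.strip(), inplace=True)  # clean column names
for col in ['trip_id', 'route_id']:
    df[col] = df[col].str.strip()  # clean relational ID values
print(df['trip_id'].tolist())  # ['a1', 'a2']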
6 changes: 6 additions & 0 deletions urbanaccess/gtfs/network.py
@@ -488,6 +488,12 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df):
stop_times_df = stop_times_df[
stop_times_df['unique_trip_id'].isin(uniquetriplist)]

    # if no records match, do not proceed and raise an error
    if len(stop_times_df) == 0:
        raise ValueError('No matching trip_ids were found. Suggest checking '
                         'for differences between trip_id values in '
                         'stop_times and trips GTFS files.')

# count missing stop times
missing_stop_times_count = stop_times_df[
'departure_time_sec'].isnull().sum()
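To see why the guard above helps (a hypothetical illustration, not code from the library): unique_trip_id values that differ only by surrounding whitespace fail an isin() match, leaving the filtered stop_times empty; the new ValueError reports that condition directly instead of letting a later step fail obscurely.

import pandas as pd

stop_times = pd.DataFrame({'unique_trip_id': ['a1 ', ' a2']})
valid_trip_ids = ['a1', 'a2']

# whitespace-padded IDs do not match, so the selection comes back empty
print(len(stop_times[stop_times['unique_trip_id'].isin(valid_trip_ids)]))  # 0

# after stripping the padding, the same IDs match
cleaned = stop_times['unique_trip_id'].str.strip()
print(len(stop_times[cleaned.isin(valid_trip_ids)]))  # 2
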
111 changes: 89 additions & 22 deletions urbanaccess/gtfs/utils_format.py
@@ -14,7 +14,7 @@ def _read_gtfs_agency(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -29,8 +29,10 @@ def _read_gtfs_agency(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
df = _remove_whitespace(df=df, textfile=textfile, col_list=None)

return df


@@ -41,7 +43,7 @@ def _read_gtfs_stops(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -62,8 +64,10 @@ def _read_gtfs_stops(textfile_path, textfile):
df['stop_lat'] = pd.to_numeric(df['stop_lat'])
df['stop_lon'] = pd.to_numeric(df['stop_lon'])

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['stop_id'])

return df


@@ -74,7 +78,7 @@ def _read_gtfs_routes(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -90,8 +94,11 @@ def _read_gtfs_routes(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['route_id'])

return df


@@ -102,7 +109,7 @@ def _read_gtfs_trips(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -118,13 +125,18 @@ def _read_gtfs_trips(textfile_path, textfile):
'service_id': object,
'route_id': object,
7: object}, low_memory=False)
# 7 is placeholder for shape id
# which may not exist in some txt files
# 7 is placeholder for shape id which may not exist in some txt files
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(
df=df,
textfile=textfile,
col_list=['trip_id', 'service_id', 'route_id'])

return df


@@ -135,7 +147,7 @@ def _read_gtfs_stop_times(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -154,8 +166,12 @@ def _read_gtfs_stop_times(textfile_path, textfile):
if len(df) == 0:
raise ValueError('{} has no records'.format(os.path.join(
textfile_path, textfile)))
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(
df=df, textfile=textfile, col_list=['trip_id', 'stop_id'])

return df


@@ -166,7 +182,7 @@ def _read_gtfs_calendar(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -189,8 +205,11 @@ def _read_gtfs_calendar(textfile_path, textfile):
'saturday', 'sunday']
for col in columnlist:
df[col] = pd.to_numeric(df[col])
# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)

# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['service_id'])

return df


@@ -201,7 +220,7 @@ def _read_gtfs_calendar_dates(textfile_path, textfile):
Parameters
----------
textfile_path : str
director of text file
directory of text file
textfile : str
name of text file
@@ -220,8 +239,10 @@ def _read_gtfs_calendar_dates(textfile_path, textfile):
log(warning_msg.format(os.path.join(
textfile_path, textfile)), level=lg.WARNING)

# remove any extra whitespace in column names
df.rename(columns=lambda x: x.strip(), inplace=True)
# remove extra whitespace that may exist in col names or before and
# after the value for columns that are used across different GTFS files
df = _remove_whitespace(df=df, textfile=textfile, col_list=['service_id'])

return df


@@ -1090,3 +1111,49 @@ def _generate_unique_feed_id(feed_folder):
'and')

return folder_snake_case_no_amps.lower()


def _remove_whitespace(df, textfile, col_list=None):
"""
    Remove leading and trailing whitespace from column names and,
    optionally, from values in the specified DataFrame columns.
Parameters
----------
df : pandas.DataFrame
DataFrame to process
textfile : str
name of text file
    col_list : list, optional
        if specified, list of column names as strings whose values will be
        checked for whitespace
Returns
-------
df : pandas.DataFrame
"""

# remove leading and trailing spaces in column names
before_cols = sorted(list(df.columns))
df.rename(columns=lambda x: x.strip(), inplace=True)
after_cols = sorted(list(df.columns))
if before_cols != after_cols:
cols_with_spaces = list(set(before_cols) - set(after_cols))
        log('GTFS file: {} column(s): {} had leading and/or trailing '
            'whitespace in column names. Spaces have been removed.'.format(
                textfile, cols_with_spaces))

# remove leading and trailing spaces in values for columns in list
if col_list:
df_copy = df.copy()
for col in col_list:
before_count = df_copy[col].str.len().sum()
df_copy[col] = df_copy[col].str.rstrip().str.lstrip()
after_count = df_copy[col].str.len().sum()
# only perform whitespace strip on columns that need it
if before_count != after_count:
df[col] = df[col].str.rstrip().str.lstrip()
                log('GTFS file: {} column: {} had leading and/or trailing '
                    'whitespace in its values. Spaces have been '
                    'removed.'.format(textfile, col))
return df
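
A short usage sketch of the helper above (illustrative values; assumes UrbanAccess is importable): column names are always stripped, while values are stripped only for the columns named in col_list.

import pandas as pd
from urbanaccess.gtfs import utils_format

trips = pd.DataFrame({'trip_id ': ['a1 ', ' a2'], 'service_id': ['wk-1', 'wk-2 ']})
trips = utils_format._remove_whitespace(
    df=trips, textfile='trips.txt', col_list=['trip_id', 'service_id'])
print(list(trips.columns))        # ['trip_id', 'service_id']
print(trips['trip_id'].tolist())  # ['a1', 'a2']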
77 changes: 77 additions & 0 deletions urbanaccess/tests/test_gtfs_utils_format.py
@@ -21,6 +21,50 @@ def folder_feed_4():
return r'/data/gtfs_feeds/city'


@pytest.fixture()
def trips_feed_w_invalid_values(tmpdir):
    # create df with ints instead of str, col names with spaces, and
    # relational column values with leading and trailing spaces
data = {
'route_id': ['10-101', '10-101', '10-101', '10-101',
111, '00111', '12-101', '12-101',
'13-101', '13-101'],
'trip_id': ['a1 ', ' a2', ' a3 ', 'a 4',
'b1', 'b2', 'c1', 'c2', 'd1', 'd2'],
'service_id ': ['weekday -1', 'weekday-1 ', 'weekday-1',
'weekday-1', 'weekday-2', 'weekday-2',
'weekday-3', 'weekday-3', 'weekend-1', 'weekend-1'],
' direction_id ': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
'wheelchair_ accessible': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
'bikes_allowed': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
}
index = range(10)
raw_df = pd.DataFrame(data, index)

feed_path = os.path.join(tmpdir.strpath, 'test_trips_invalid_values')
os.makedirs(feed_path)
print('writing test data to dir: {}'.format(feed_path))
feed_file_name = '{}.txt'.format('trips')
raw_df.to_csv(os.path.join(feed_path, feed_file_name), index=False)

data = {
'route_id': ['10-101', '10-101', '10-101', '10-101',
'111', '00111', '12-101', '12-101', '13-101', '13-101'],
'trip_id': ['a1', 'a2', 'a3', 'a 4',
'b1', 'b2', 'c1', 'c2', 'd1', 'd2'],
'service_id': ['weekday -1', 'weekday-1', 'weekday-1',
'weekday-1', 'weekday-2', 'weekday-2',
'weekday-3', 'weekday-3', 'weekend-1', 'weekend-1'],
'direction_id': [1, 0, 1, 0, 1, 0, 1, 0, 1, 0],
'wheelchair_ accessible': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0],
'bikes_allowed': [1, 1, 1, 1, 0, 0, 0, 0, 0, 0]
}
index = range(10)
expected_df = pd.DataFrame(data, index)

return raw_df, expected_df, feed_path


def test_calendar_dates_agencyid_feed_1(calendar_dates_feed_1,
routes_feed_1,
trips_feed_1,
@@ -866,3 +910,36 @@ def test_add_unique_gtfsfeed_id(stops_feed_1, routes_feed_1, trips_feed_1,
# identical to the cols in input df
original_cols = df_dict[df][1].columns
assert df_dict[df][1].equals(df_dict[df][0][original_cols])


def test_remove_whitespace_from_values(trips_feed_w_invalid_values):
raw_df, expected_df, feed_path = trips_feed_w_invalid_values

    # convert the one int record to str to match the dtype that the
    # read_gtfs function would produce
raw_df['route_id'] = raw_df['route_id'].astype('str')

# test when col_list is used
result = utils_format._remove_whitespace(
df=raw_df,
textfile='trips.txt',
col_list=['trip_id', 'service_id', 'route_id'])
assert result.equals(expected_df)

# test when no col_list is used
result_no_col_list = utils_format._remove_whitespace(
df=raw_df,
textfile='trips.txt',
col_list=None)
# spaces in cols should be removed
assert list(result_no_col_list.columns) == list(expected_df.columns)
# spaces in values should remain
assert result_no_col_list['trip_id'].str.len().sum() == raw_df[
'trip_id'].str.len().sum()


def test_read_gtfs_trips_w_invalid_values(trips_feed_w_invalid_values):
raw_df, expected_df, feed_path = trips_feed_w_invalid_values
result = utils_format._read_gtfs_trips(
textfile_path=feed_path, textfile='trips.txt')
assert result.equals(expected_df)
