From 531c394d48784cd7bbff70a84f20a9f73731fa1f Mon Sep 17 00:00:00 2001 From: sablanchard Date: Thu, 29 Apr 2021 15:13:57 -0700 Subject: [PATCH] GTFS network refactor (#86) --- .travis.yml | 1 + CONTRIBUTING.md | 68 +- urbanaccess/config.py | 8 +- urbanaccess/gtfs/headways.py | 60 +- urbanaccess/gtfs/network.py | 865 ++++++------ urbanaccess/gtfs/utils_format.py | 40 +- urbanaccess/gtfs/utils_validation.py | 37 + urbanaccess/gtfsfeeds.py | 6 +- urbanaccess/network.py | 46 +- urbanaccess/osm/load.py | 2 +- urbanaccess/osm/network.py | 2 +- urbanaccess/plot.py | 12 +- urbanaccess/tests/conftest.py | 3 +- urbanaccess/tests/test_gtfs_load.py | 21 +- urbanaccess/tests/test_gtfs_network.py | 1192 ++++++++++++++++- urbanaccess/tests/test_gtfs_utils_format.py | 6 + .../tests/test_gtfs_utils_validation.py | 33 + urbanaccess/tests/test_gtfsfeeds.py | 2 +- urbanaccess/utils.py | 85 +- 19 files changed, 1852 insertions(+), 637 deletions(-) create mode 100644 urbanaccess/tests/test_gtfs_utils_validation.py diff --git a/.travis.yml b/.travis.yml index 7f77eae..1e6244e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -21,6 +21,7 @@ install: - conda create -n test-environment python=$TRAVIS_PYTHON_VERSION pyyaml --file requirements-dev.txt - source activate test-environment - conda info --all + - pip install 'numpy>=1.18' - pip install . - pip list - pip show urbanaccess diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index b91e6e2..5c082b3 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,20 +1,68 @@ +Thanks for using UrbanAccess! + +This is an open source project that's part of the Urban Data Science Toolkit. Development and maintenance is a collaboration between UrbanSim Inc, U.C. Berkeley's Urban Analytics Lab, and other contributors. + +## If you have a problem: + +- Take a look at the [open issues](https://github.com/UDST/urbanaccess/issues) and [closed issues](https://github.com/UDST/urbanaccess/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion + +- Open a new issue describing the problem -- if possible, include any error messages, a full reproducible example of the code that generated the error, the operating system and version of Python you're using, and versions of any libraries that may be relevant + +## Feature proposals: + +- Take a look at the [open issues](https://github.com/UDST/urbanaccess/issues) and [closed issues](https://github.com/UDST/urbanaccess/issues?q=is%3Aissue+is%3Aclosed) to see if there's already a related discussion + +- Post your proposal as a new issue, so we can discuss it (some proposals may not be a good fit for the project) + +## Contributing code: + +- Create a new branch of `UDST/urbanaccess/dev`, or fork the repository to your own account + +- Make your changes, following the existing styles for code and inline documentation + +- Add [tests](https://github.com/UDST/urbanaccess/tree/dev/urbanaccess/tests) if possible + - We use the test suite: Pytest + +- Run tests and address any issues that may be flagged. If flags are raised that are not due to the PR note that in a new comment in the PR + - Run Pytest test suite: `py.test` + - UrbanAccess currently supports Python 2.7, 3.5, 3.6, 3.7, 3.8. 
Tests will be run in these environments when the PR is created but any flags raised in these environments should also be addressed + - UrbanAccess also uses a series of integration tests to test entire workflows, run the integration tests: + - Run: + ```cd demo + jupyter nbconvert --to python simple_example.ipynb + cd ../urbanaccess/tests/integration + python remove_nb_magic.py -in simple_example.py -out simple_example_clean.py + cd ../../../demo + python simple_example_clean.py + cd ../urbanaccess/tests/integration + python integration_madison.py + python integration_sandiego.py + - Run pycodestyle Python style guide checker: `pycodestyle --max-line-length=100 urbanaccess` + +- Open a pull request to the `UDST/urbanaccess` `dev` branch, including a writeup of your changes -- take a look at some of the closed PR's for examples + +- Current maintainers will review the code, suggest changes, and hopefully merge it and schedule it for an upcoming release + +## Updating the documentation: + +- See instructions in `docs/README.md` + ## Preparing a release: - Make a new branch for release prep -- Update the version number and changelog +- Update the version number and changelog: - `CHANGELOG.md` - `setup.py` - `urbanaccess/__init__.py` - - `docs/source/conf.py` - `docs/source/index.rst` + - `docs/source/conf.py` - Make sure all the tests are passing, and check if updates are needed to `README.md` or to the documentation -- Open a pull request to the master branch to finalize it - -- After merging, tag the release on GitHub and follow the distribution procedures below +- Open a pull request to the `dev` branch to finalize it and wait for a PR review and approval +- After the PR has been approved, it can be merged to `dev`. Then a release PR can be created from `dev` to merge into `master`. Once merged, tag the release on GitHub and follow the distribution procedures below: ## Distributing a release on PyPI (for pip installation): @@ -24,17 +72,21 @@ - Run `python setup.py sdist bdist_wheel --universal` -- This should create a `dist` directory containing two package files -- delete any old ones before the next step +- This should create a `dist` directory containing a gzip package file -- delete any old ones before the next step - Run `twine upload dist/*` -- this will prompt you for your pypi.org credentials -- Check https://pypi.org/project/osmnet/ for the new version +- Check https://pypi.org/project/urbanaccess/ for the new version ## Distributing a release on Conda Forge (for conda installation): -- The [conda-forge/urbanaccess-feedstock](https://github.com/conda-forge/urbanaccess-feedstock) repository controls the Conda Forge release +- The [conda-forge/urbanaccess-feedstock](https://github.com/conda-forge/urbanaccess-feedstock) repository controls the Conda Forge release, including which GitHub users have maintainer status for the repo - Conda Forge bots usually detect new releases on PyPI and set in motion the appropriate feedstock updates, which a current maintainer will need to approve and merge +- Maintainers can add on additional changes before merging the PR, for example to update the requirements or edit the list of maintainers + +- You can also fork the feedstock and open a PR manually. 
It seems like this must be done from a personal account (not a group account like UDST) so that the bots can be granted permission for automated cleanup + - Check https://anaconda.org/conda-forge/urbanaccess for the new version (may take a few minutes for it to appear) \ No newline at end of file diff --git a/urbanaccess/config.py b/urbanaccess/config.py index cba8f25..9902ff2 100644 --- a/urbanaccess/config.py +++ b/urbanaccess/config.py @@ -4,7 +4,7 @@ def _format_check(settings): """ - Check the format of a urbanaccess_config object. + Check the format of an urbanaccess_config object. Parameters ---------- @@ -84,7 +84,7 @@ def __init__(self, def from_yaml(cls, configdir='configs', yamlname='urbanaccess_config.yaml'): """ - Create a urbanaccess_config instance from a saved YAML configuration. + Create an urbanaccess_config instance from a saved YAML configuration. Parameters ---------- @@ -108,7 +108,7 @@ def from_yaml(cls, configdir='configs', yaml_file = os.path.join(configdir, yamlname) with open(yaml_file, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) settings = cls(data_folder=yaml_config.get('data_folder', 'data'), logs_folder=yaml_config.get('logs_folder', 'logs'), @@ -143,7 +143,7 @@ def to_dict(self): def to_yaml(self, configdir='configs', yamlname='urbanaccess_config.yaml', overwrite=False): """ - Save a urbanaccess_config representation to a YAML file. + Save an urbanaccess_config representation to a YAML file. Parameters ---------- diff --git a/urbanaccess/gtfs/headways.py b/urbanaccess/gtfs/headways.py index 0b62a0a..2e4b8ca 100644 --- a/urbanaccess/gtfs/headways.py +++ b/urbanaccess/gtfs/headways.py @@ -4,6 +4,7 @@ import logging as lg from urbanaccess.utils import log +from urbanaccess.gtfs.utils_validation import _check_time_range_format from urbanaccess.gtfs.network import _time_selector warnings.simplefilter(action="ignore", category=FutureWarning) @@ -68,26 +69,25 @@ def _headway_handler(interpolated_stop_times_df, trips_df, Parameters ---------- interpolated_stop_times_df : pandas.DataFrame - interpolated stop times dataframe for stop times within the time range + interpolated stop times DataFrame for stop times within the time range trips_df : pandas.DataFrame - trips dataframe + trips DataFrame routes_df : pandas.DataFrame - routes dataframe + routes DataFrame headway_timerange : list - time range for which to calculate headways between as a - list of time 1 and time 2 where times are 24 hour clock strings - such as: - ['07:00:00', '10:00:00'] + time range for which to calculate headways between in a list with time + 1 and time 2 as strings. 
Must follow format of a 24 hour clock for + example: 08:00:00 or 17:00:00 Returns ------- headway_by_routestop_df : pandas.DataFrame - dataframe of statistics of route stop headways in units of minutes + DataFrame of statistics of route stop headways in units of minutes with relevant route and stop information """ start_time = time.time() - # add unique trip and route id + # add unique trip and route ID trips_df['unique_trip_id'] = ( trips_df['trip_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) @@ -105,7 +105,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df, trips_df = trips_df[columns] - # add unique route id + # add unique route ID routes_df['unique_route_id'] = ( routes_df['route_id'].str.cat( routes_df['unique_agency_id'].astype('str'), sep='_')) @@ -138,7 +138,7 @@ def _headway_handler(interpolated_stop_times_df, trips_df, headway_by_routestop_df['unique_stop_id'].str.cat( headway_by_routestop_df['unique_route_id'].astype('str'), sep='_')) - log('headway calculation complete. Took {:,.2f} seconds'.format( + log('Headway calculation complete. Took {:,.2f} seconds.'.format( time.time() - start_time)) return headway_by_routestop_df @@ -153,9 +153,9 @@ def headways(gtfsfeeds_df, headway_timerange): gtfsfeeds_df : object gtfsfeeds_dfs object with all processed GTFS data tables headway_timerange : list - time range for which to calculate headways between as a list of - time 1 and time 2 where times are 24 hour clock strings such as: - ['07:00:00', '10:00:00'] + time range for which to calculate headways between in a list with time + 1 and time 2 as strings. Must follow format of a 24 hour clock for + example: 08:00:00 or 17:00:00 Returns ------- @@ -164,39 +164,15 @@ def headways(gtfsfeeds_df, headway_timerange): route stop headways in units of minutes with relevant route and stop information """ - - time_error_statement = ( - '{} starttime and endtime are not in the correct format. ' - 'Format should be a 24 hour clock in following format: 08:00:00 ' - 'or 17:00:00'.format(headway_timerange)) - if not isinstance(headway_timerange, list) or len(headway_timerange) != 2: - raise ValueError('timerange must be a list of length 2') - if headway_timerange[0].split(':')[0] > headway_timerange[1].split(':')[0]: - raise ValueError('starttime is greater than endtime') - - for t in headway_timerange: - if not isinstance(t, str): - raise ValueError(time_error_statement) - if len(t) != 8: - raise ValueError(time_error_statement) - if int(headway_timerange[1].split(':')[0]) - int( - headway_timerange[0].split(':')[0]) > 3: - long_time_range_msg = ( - 'WARNING: Time range passed: {} is a {} hour period. 
Long periods ' - 'over 3 hours may take a significant amount of time to process.') - log(long_time_range_msg.format(headway_timerange, - int(str( - headway_timerange[1][0:2])) - int( - str(headway_timerange[0][0:2]))), - level=lg.WARNING) + _check_time_range_format(headway_timerange) if gtfsfeeds_df is None: - raise ValueError('gtfsfeeds_df cannot be None') + raise ValueError('gtfsfeeds_df cannot be None.') if gtfsfeeds_df.stop_times_int.empty or gtfsfeeds_df.trips.empty or \ gtfsfeeds_df.routes.empty: raise ValueError( - 'one of the gtfsfeeds_dfs objects: stop_times_int, trips, ' - 'or routes were found to be empty.') + 'One of the following gtfsfeeds_dfs objects: stop_times_int, ' + 'trips, or routes were found to be empty.') headways_df = _headway_handler( interpolated_stop_times_df=gtfsfeeds_df.stop_times_int, diff --git a/urbanaccess/gtfs/network.py b/urbanaccess/gtfs/network.py index 306958b..ba87c28 100644 --- a/urbanaccess/gtfs/network.py +++ b/urbanaccess/gtfs/network.py @@ -1,24 +1,29 @@ from __future__ import division +import os import pandas as pd import time import logging as lg from urbanaccess.utils import log, df_to_hdf5, hdf5_to_df +from urbanaccess.gtfs.utils_validation import _check_time_range_format from urbanaccess.network import ua_network from urbanaccess import config -from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs +from urbanaccess.gtfs.gtfsfeeds_dataframe import gtfsfeeds_dfs, \ + urbanaccess_gtfs_df pd.options.mode.chained_assignment = None -def create_transit_net(gtfsfeeds_dfs, day, - timerange, - calendar_dates_lookup=None, - overwrite_existing_stop_times_int=False, - use_existing_stop_times_int=False, - save_processed_gtfs=False, - save_dir=config.settings.data_folder, - save_filename=None): +def create_transit_net( + gtfsfeeds_dfs, + day, + timerange, + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=config.settings.data_folder, + save_filename=None): """ Create a travel time weight network graph in units of minutes from GTFS data @@ -26,16 +31,16 @@ def create_transit_net(gtfsfeeds_dfs, day, Parameters ---------- gtfsfeeds_dfs : object - gtfsfeeds_dfs object with DataFrames of stops, routes, trips, + urbanaccess_gtfs_df object with DataFrames of stops, routes, trips, stop_times, calendar, calendar_dates (optional) and stop_times_int (optional) - day : {'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', - 'wednesday'} + day : {'monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'} day of the week to extract transit schedule from that corresponds to the day in the GTFS calendar timerange : list time range to extract transit schedule from in a list with time - 1 and time 2. it is suggested the time range + 1 and time 2 as strings. 
It is suggested the time range specified is large enough to allow for travel from one end of the transit network to the other but small enough to represent a relevant travel time period such as a 3 hour window @@ -59,11 +64,11 @@ def create_transit_net(gtfsfeeds_dfs, day, gtfsfeeds_dfs object it will be used instead of re-calculated save_processed_gtfs : bool, optional if true, all processed GTFS DataFrames will - be stored to disk in a hdf5 file + be stored to disk in a HDF5 file save_dir : str, optional - directory to save the hdf5 file + directory to save the HDF5 file save_filename : str, optional - name to save the hdf5 file as + name to save the HDF5 file as Returns ------- @@ -73,31 +78,11 @@ def create_transit_net(gtfsfeeds_dfs, day, """ start_time = time.time() - time_error_statement = ( - '{} starttime and endtime are not in the correct format. ' - 'Format should be a 24 hour clock in the following format: 08:00:00 ' - 'or 17:00:00'.format( - timerange)) - if not isinstance(timerange, list) or len(timerange) != 2: - raise ValueError(time_error_statement) - if timerange[0] > timerange[1]: - raise ValueError(time_error_statement) - for t in timerange: - if not isinstance(t, str): - raise ValueError(time_error_statement) - if len(t) != 8: - raise ValueError(time_error_statement) - if int(str(timerange[1][0:2])) - int(str(timerange[0][0:2])) > 3: - log( - 'WARNING: Time range passed: {} is a {} hour period. Long ' - 'periods over 3 hours may take a significant amount of time to ' - 'process.'.format( - timerange, - int(str(timerange[1][0:2])) - int(str(timerange[0][0:2]))), - level=lg.WARNING) - if gtfsfeeds_dfs is None: - raise ValueError('gtfsfeeds_dfs is None') - error_msg = ('one of the following gtfsfeeds_dfs objects {} were ' + _check_time_range_format(timerange) + if not isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df): + raise ValueError('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + error_msg = ('One of the following gtfsfeeds_dfs objects: {} were ' 'found to be empty.') if gtfsfeeds_dfs.trips.empty or gtfsfeeds_dfs.stop_times.empty or \ gtfsfeeds_dfs.stops.empty: @@ -107,11 +92,14 @@ def create_transit_net(gtfsfeeds_dfs, day, error_msg_case_2 = 'calendar or calendar_dates' raise ValueError(error_msg.format(error_msg_case_2)) if not isinstance(overwrite_existing_stop_times_int, bool): - raise ValueError('overwrite_existing_stop_times_int must be bool') + raise ValueError('overwrite_existing_stop_times_int must be bool.') if not isinstance(use_existing_stop_times_int, bool): - raise ValueError('use_existing_stop_times_int must be bool') + raise ValueError('use_existing_stop_times_int must be bool.') if not isinstance(save_processed_gtfs, bool): - raise ValueError('save_processed_gtfs must be bool') + raise ValueError('save_processed_gtfs must be bool.') + if overwrite_existing_stop_times_int and use_existing_stop_times_int: + raise ValueError('overwrite_existing_stop_times_int and ' + 'use_existing_stop_times_int cannot both be True.') columns = ['route_id', 'direction_id', @@ -132,9 +120,13 @@ def create_transit_net(gtfsfeeds_dfs, day, day=day, calendar_dates_lookup=calendar_dates_lookup) + # proceed to calc stop_times_int if stop_times_int is already empty, or + # overwrite existing is True, or use existing is False if gtfsfeeds_dfs.stop_times_int.empty or \ - overwrite_existing_stop_times_int or use_existing_stop_times_int\ + overwrite_existing_stop_times_int or use_existing_stop_times_int \ is False: + if overwrite_existing_stop_times_int: + log(' 
Overwriting existing stop_times_int DataFrame...') gtfsfeeds_dfs.stop_times_int = _interpolate_stop_times( stop_times_df=gtfsfeeds_dfs.stop_times, calendar_selected_trips_df=calendar_selected_trips_df) @@ -147,10 +139,7 @@ def create_transit_net(gtfsfeeds_dfs, day, dir=save_dir, filename=save_filename) if use_existing_stop_times_int: - if gtfsfeeds_dfs.stop_times_int.empty: - raise ValueError('existing stop_times_int is empty. Set ' - 'use_existing_stop_times_int to False to create ' - 'it.') + log(' Using existing stop_times_int DataFrame...') selected_interpolated_stop_times_df = _time_selector( df=gtfsfeeds_dfs.stop_times_int, @@ -158,17 +147,12 @@ def create_transit_net(gtfsfeeds_dfs, day, endtime=timerange[1]) final_edge_table = _format_transit_net_edge( - stop_times_df=selected_interpolated_stop_times_df[['unique_trip_id', - 'stop_id', - 'unique_stop_id', - 'timediff', - 'stop_sequence', - 'unique_agency_id', - 'trip_id']]) - - transit_edges = _convert_imp_time_units(df=final_edge_table, - time_col='weight', - convert_to='minutes') + stop_times_df=selected_interpolated_stop_times_df[ + ['unique_trip_id', 'stop_id', 'unique_stop_id', 'timediff', + 'stop_sequence', 'unique_agency_id', 'trip_id']]) + + transit_edges = _convert_imp_time_units( + df=final_edge_table, time_col='weight', convert_to='minutes') final_selected_stops = _stops_in_edge_table_selector( input_stops_df=gtfsfeeds_dfs.stops, @@ -176,11 +160,11 @@ def create_transit_net(gtfsfeeds_dfs, day, transit_nodes = _format_transit_net_nodes(df=final_selected_stops) - transit_edges = _route_type_to_edge(transit_edge_df=transit_edges, - stop_time_df=gtfsfeeds_dfs.stop_times) + transit_edges = _route_type_to_edge( + transit_edge_df=transit_edges, stop_time_df=gtfsfeeds_dfs.stop_times) - transit_edges = _route_id_to_edge(transit_edge_df=transit_edges, - trips_df=gtfsfeeds_dfs.trips) + transit_edges = _route_id_to_edge( + transit_edge_df=transit_edges, trips_df=gtfsfeeds_dfs.trips) # assign node and edge net type transit_nodes['net_type'] = 'transit' @@ -190,7 +174,7 @@ def create_transit_net(gtfsfeeds_dfs, day, ua_network.transit_edges = transit_edges ua_network.transit_nodes = transit_nodes - log('Successfully created transit network. Took {:,.2f} seconds'.format( + log('Successfully created transit network. Took {:,.2f} seconds.'.format( time.time() - start_time)) return ua_network @@ -210,8 +194,8 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, calendar DataFrame input_calendar_dates_df : pandas.DataFrame calendar_dates DataFrame - day : {'friday', 'monday', 'saturday', 'sunday', 'thursday', 'tuesday', - 'wednesday'} + day : {'monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'} day of the week to extract transit schedule that corresponds to the day in the GTFS calendar calendar_dates_lookup : dict, optional @@ -230,39 +214,41 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, """ start_time = time.time() - valid_days = ['friday', 'monday', 'saturday', 'sunday', - 'thursday', 'tuesday', 'wednesday'] + valid_days = ['monday', 'tuesday', 'wednesday', 'thursday', + 'friday', 'saturday', 'sunday'] if day not in valid_days: + valid_days_str = str(valid_days).replace('[', '').replace(']', '') raise ValueError('Incorrect day specified. 
Must be one of lowercase ' - 'strings: friday, monday, saturday, sunday, ' - 'thursday, tuesday, wednesday.') + 'strings: {}.'.format(valid_days_str)) # check format of calendar_dates_lookup if calendar_dates_lookup is not None: if not isinstance(calendar_dates_lookup, dict): - raise ValueError('calendar_dates_lookup parameter is not a dict') + raise ValueError( + 'calendar_dates_lookup parameter must be a dictionary.') for key in calendar_dates_lookup.keys(): if not isinstance(key, str): - raise ValueError('calendar_dates_lookup key {} must be a ' - 'string'.format(key)) + raise ValueError('calendar_dates_lookup key: {} ' + 'must be a string.'.format(key)) if isinstance(calendar_dates_lookup[key], str): value = [calendar_dates_lookup[key]] else: if not isinstance(calendar_dates_lookup[key], list): raise ValueError( - 'calendar_dates_lookup value {} must be a string or a ' - 'list of strings'.format( + 'calendar_dates_lookup value: {} must be a string or ' + 'a list of strings.'.format( calendar_dates_lookup[key])) else: value = calendar_dates_lookup[key] for string in value: if not isinstance(string, str): - raise ValueError('{} must be a string'.format(value)) + raise ValueError('calendar_dates_lookup value: {} ' + 'must contain strings.'.format(value)) - # create unique service ids + # create unique service IDs df_list = [input_trips_df, input_calendar_df] # if input_calendar_dates_df is not empty then add it to processing if input_calendar_dates_df.empty is False: @@ -270,49 +256,48 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, for index, df in enumerate(df_list): df['unique_service_id'] = (df['service_id'].str.cat( - df['unique_agency_id'].astype('str'), - sep='_')) + df['unique_agency_id'].astype('str'), sep='_')) df_list[index] = df - # select service ids where day specified has a 1 = service runs on that day - log('Using calendar to extract service_ids to select trips.') + # select service IDs where day specified has a 1 = service runs on that day + log('Using calendar to extract service_ids to select trips...') input_calendar_df = input_calendar_df[(input_calendar_df[day] == 1)] input_calendar_df = input_calendar_df[['unique_service_id']] num_cal_service_ids_extracted = len(input_calendar_df) - log('{:,} service_ids were extracted from calendar'.format( + log('{:,} service_ids were extracted from calendar.'.format( num_cal_service_ids_extracted)) # generate information needed to tell user the status of their trips in # terms of service_ids in calendar and calendar_dates tables - trips_in_calendar = input_trips_df.loc[input_trips_df[ - 'unique_service_id'].isin( - input_calendar_df['unique_service_id'])] - trips_notin_calendar = input_trips_df.loc[~input_trips_df[ - 'unique_service_id'].isin(input_calendar_df['unique_service_id'])] + trips_in_calendar = input_trips_df.loc[ + input_trips_df['unique_service_id'].isin( + input_calendar_df['unique_service_id'])] + trips_notin_calendar = input_trips_df.loc[ + ~input_trips_df['unique_service_id'].isin( + input_calendar_df['unique_service_id'])] - pct_trips_in_calendar = round(len(trips_in_calendar) / len( + cnt_input_trips_df = len(input_trips_df) + cnt_trips_in_calendar = len(trips_in_calendar) + pct_trips_in_calendar = round(cnt_trips_in_calendar / len( input_trips_df) * 100, 2) feeds_wtrips_in_cal = trips_in_calendar['unique_feed_id'].unique() print_feed_ids = [' '.join(feed_id.split('_')[:-1]) for feed_id in feeds_wtrips_in_cal] feeds_wotrips_in_cal = trips_notin_calendar['unique_feed_id'].unique() - log( - '{:,} 
trip(s) {:.2f} percent of {:,} total trip records were ' - 'found in calendar for GTFS feed(s): {}'.format( - len(trips_in_calendar), - pct_trips_in_calendar, - len(input_trips_df), - print_feed_ids)) + if print_feed_ids: + log('{:,} trip(s) {:.2f} percent of {:,} total trip records were ' + 'found in calendar for GTFS feed(s): {}.'.format( + cnt_trips_in_calendar, pct_trips_in_calendar, cnt_input_trips_df, + print_feed_ids)) feed_id_not_in_cal = [x for x in feeds_wotrips_in_cal if x not in feeds_wtrips_in_cal] for feed_id in feed_id_not_in_cal: - log( - '0 trip(s) 0 percent of {:,} total trip records were ' - 'found in calendar for GTFS feed: {}'.format( - len(input_trips_df), - ' '.join(feed_id.split('_')[:-1]))) + trip_feed_name = ' '.join(feed_id.split('_')[:-1]) + log('0 trip(s) 0 percent of {:,} total trip records were ' + 'found in calendar for GTFS feed: {}.'.format( + cnt_input_trips_df, trip_feed_name)) if len(trips_notin_calendar) > 0 and calendar_dates_lookup is None: warning_msg = ( @@ -325,46 +310,41 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, 'in doubt do not use the calendar_dates_lookup parameter.') log(warning_msg, level=lg.WARNING) - if len(feeds_wtrips_in_cal) != len( - feeds_wotrips_in_cal) and calendar_dates_lookup is None: + if len(feeds_wtrips_in_cal) != len(feeds_wotrips_in_cal) and \ + calendar_dates_lookup is None: for feed_id in feeds_wotrips_in_cal: - log( - '{:,} trip(s) {:.2f} percent of {:,} total trip records were ' - 'not found in calendar for GTFS feed: {}'.format( - len(trips_in_calendar), - pct_trips_in_calendar, - len(input_trips_df), - ' '.join(feed_id.split('_')[:-1]))) + trip_feed_name = ' '.join(feed_id.split('_')[:-1]) + log('{:,} trip(s) {:.2f} percent of {:,} total trip records were ' + 'not found in calendar for GTFS feed: {}.'.format( + cnt_trips_in_calendar, pct_trips_in_calendar, + cnt_input_trips_df, trip_feed_name)) if feed_id not in feeds_wtrips_in_cal: log('Warning: GTFS feed: {} no trips were selected using ' 'calendar. It is suggested you use the ' - 'calendar_dates_lookup parameter to utilize this feeds ' - 'calendar_dates file.'.format( - ' '.join(feed_id.split('_')[:-1])), + 'calendar_dates_lookup parameter to utilize this feed\'s ' + 'calendar_dates file.'.format(trip_feed_name), level=lg.WARNING) # look for service_ids inside of calendar_dates if calendar does not # supply enough service_ids to select trips by if len(trips_notin_calendar) > 0 and calendar_dates_lookup is not None: - log('Using calendar_dates to supplement service_ids extracted from ' - 'calendar to select trips.') + 'calendar to select trips...') subset_result_df = pd.DataFrame() if input_calendar_dates_df.empty: - raise ValueError( - 'calendar_dates_df is empty. Unable to use the ' - 'calendar_dates_lookup parameter') + raise ValueError('calendar_dates_df is empty. 
Unable to use the ' + 'calendar_dates_lookup parameter.') for col_name_key, string_value in calendar_dates_lookup.items(): if col_name_key not in input_calendar_dates_df.columns: - raise ValueError('{} column not found in calendar_dates ' - 'dataframe'.format(col_name_key)) + raise ValueError('Column: {} not found in calendar_dates ' + 'dataframe.'.format(col_name_key)) if col_name_key not in input_calendar_dates_df.select_dtypes( include=[object]).columns: - raise ValueError('{} column is not object type'.format( + raise ValueError('Column: {} must be object type.'.format( col_name_key)) if not isinstance(string_value, list): @@ -372,21 +352,19 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, for text in string_value: # TODO: modify this in order to allow subset based on gtfs - # feed name or a or/and condition + # feed name or a or/and condition subset_result = input_calendar_dates_df[ input_calendar_dates_df[col_name_key].str.match( text, case=False, na=False)] - if len(subset_result) != 0: + cnt_subset_result = len(subset_result) + if cnt_subset_result != 0: feed_id_list = subset_result['unique_feed_id'].unique() for index, id in enumerate(feed_id_list): feed_id_list[index] = ' '.join(id.split('_')[:-1]) - log('Found {:,} records that matched query: column: {} ' - 'and string: {} for GTFS feed(s): {}'.format(len( - subset_result), - col_name_key, - text, - feed_id_list)) + log('Found {:,} record(s) that matched query: column: {} ' + 'and string: {} for GTFS feed(s): {}.'.format( + cnt_subset_result, col_name_key, text, feed_id_list)) subset_result_df = subset_result_df.append(subset_result) @@ -394,16 +372,16 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, subset_result_df = subset_result_df[['unique_service_id']] num_caldates_service_ids_extracted = len(subset_result_df) - log('An additional {:,} service_ids were extracted from ' - 'calendar_dates. Total service_ids extracted: {:,}'.format( - num_caldates_service_ids_extracted, - num_caldates_service_ids_extracted + - num_cal_service_ids_extracted)) + tot_service_ids_extracted = \ + num_caldates_service_ids_extracted + num_cal_service_ids_extracted + log('An additional {:,} service_id(s) were extracted from ' + 'calendar_dates. Total service_id(s) extracted: {:,}.'.format( + num_caldates_service_ids_extracted, tot_service_ids_extracted)) input_calendar_df = input_calendar_df.append(subset_result_df) input_calendar_df.drop_duplicates(inplace=True) - # select and create df of trips that match the service ids for the day of - # the week specified merge calendar df that has service ids for + # select and create df of trips that match the service IDs for the day of + # the week specified merge calendar df that has service IDs for # specified day with trips df calendar_selected_trips_df = input_trips_df.loc[ input_trips_df['unique_service_id'].isin( @@ -416,22 +394,18 @@ def _trip_schedule_selector(input_trips_df, input_calendar_df, calendar_selected_trips_df.reset_index(drop=True, inplace=True) calendar_selected_trips_df.drop('unique_service_id', axis=1, inplace=True) + calendar_selected_trips_count = len(calendar_selected_trips_df) if calendar_dates_lookup is None: log('{:,} of {:,} total trips were extracted representing calendar ' - 'day: {}. Took {:,.2f} seconds'.format(len( - calendar_selected_trips_df), - len(input_trips_df), - day, - time.time() - start_time)) + 'day: {}. 
Took {:,.2f} seconds.'.format( + calendar_selected_trips_count, cnt_input_trips_df, day, + time.time() - start_time)) else: log('{:,} of {:,} total trips were extracted representing calendar ' - 'day: {} and calendar_dates search parameters: {}. Took {:,' - '.2f} seconds'.format(len( - calendar_selected_trips_df), - len(input_trips_df), - day, - calendar_dates_lookup, - time.time() - start_time)) + 'day: {} and calendar_dates search parameters: {}. ' + 'Took {:,.2f} seconds.'.format( + calendar_selected_trips_count, cnt_input_trips_df, day, + calendar_dates_lookup, time.time() - start_time)) return calendar_selected_trips_df @@ -456,33 +430,31 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): start_time = time.time() - # create unique trip ids + # create unique trip IDs df_list = [calendar_selected_trips_df, stop_times_df] for index, df in enumerate(df_list): df['unique_trip_id'] = (df['trip_id'].str.cat( - df['unique_agency_id'].astype('str'), - sep='_')) + df['unique_agency_id'].astype('str'), sep='_')) df_list[index] = df # sort stop times inplace based on first to last stop in # sequence -- required as the linear interpolator runs # from first value to last value if stop_times_df['stop_sequence'].isnull().sum() > 1: - log('WARNING: There are {:,} ' - 'stop_sequence records missing in the stop_times DataFrame. ' - 'Please check these missing values. In order for interpolation ' - 'to proceed correctly, ' - 'all records must have a stop_sequence value.'.format( - stop_times_df['stop_sequence'].isnull().sum()), + log('WARNING: There are {:,} stop_sequence records missing in the ' + 'stop_times DataFrame. Please check these missing values. ' + 'In order for interpolation to proceed correctly, all records ' + 'must have a stop_sequence value.'.format( + stop_times_df['stop_sequence'].isnull().sum()), level=lg.WARNING) stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'], inplace=True) - # make list of unique trip ids from the calendar_selected_trips_df + # make list of unique trip IDs from the calendar_selected_trips_df uniquetriplist = calendar_selected_trips_df[ 'unique_trip_id'].unique().tolist() - # select trip ids that match the trips in the + # select trip IDs that match the trips in the # calendar_selected_trips_df -- resulting df will be stop times # only for trips that run on the service day or dates of interest stop_times_df = stop_times_df[ @@ -501,30 +473,33 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): # if there are stop times missing that need interpolation notify user if missing_stop_times_count > 0: - log('Note: Processing may take a long time depending' - ' on the number of records. ' - 'Total unique trips to assess: {:,}'.format( - len(stop_times_df['unique_trip_id'].unique())), + log('Note: Processing may take a long time depending ' + 'on the number of records. 
' + 'Total unique trips to assess: {:,}.'.format( + len(stop_times_df['unique_trip_id'].unique())), level=lg.WARNING) log('Starting departure stop time interpolation...') - log( - 'Departure time records missing from trips following the ' + log('Departure time records missing from trips following the ' 'specified schedule: {:,} ({:.2f} percent of {:,} total ' - 'records)'.format( - missing_stop_times_count, - (missing_stop_times_count / len(stop_times_df)) * 100, - len(stop_times_df['departure_time_sec']))) + 'records.)'.format( + missing_stop_times_count, + (missing_stop_times_count / len(stop_times_df)) * 100, + len(stop_times_df['departure_time_sec']))) log('Interpolating...') else: - log('There are no departure time records missing from trips ' 'following the specified schedule. There are no records to ' 'interpolate.') + # TODO: for the rare and unlikely case when there is 1 null record and + # its not the first or last stop in the stop sequence, that value + # should be interpolated and its trip ID should be added to those to be + # interpolated - this additional case would have to be benchmarked + # for speed to ensure it doesnt slow down existing process # Find trips with more than one missing time - # Note: all trip ids have at least 1 null departure time because the + # Note: all trip IDs have at least 1 null departure time because the # last stop in a trip is always null null_times = stop_times_df[stop_times_df.departure_time_sec.isnull()] trips_with_null = null_times.unique_trip_id.value_counts() @@ -536,16 +511,27 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): stop_times_df.unique_trip_id.isin(trips_with_more_than_one_null)] if len(df_for_interpolation) > 0: + # check for duplicate stop_sequence and unique_trip_id combination, + # if dups are found this will throw an error during the pivot() + # operation so catch and return to user instead + dup_df = df_for_interpolation[df_for_interpolation.duplicated( + subset=['stop_sequence', 'unique_trip_id'], keep='first')] + if len(dup_df) != 0: + dup_values = list(dup_df['unique_trip_id'].unique()) + raise ValueError('Found duplicate values when values from ' + 'stop_sequence and unique_trip_id are combined. 
' + 'Check values in these columns for ' + 'trip_id(s): {}.'.format(dup_values)) # Pivot to DataFrame where each unique trip has its own column # Index is stop_sequence - pivot = df_for_interpolation.pivot(index='stop_sequence', - columns='unique_trip_id', - values='departure_time_sec') + pivot = df_for_interpolation.pivot( + index='stop_sequence', columns='unique_trip_id', + values='departure_time_sec') # Interpolate on the whole DataFrame at once - interpolator = pivot.interpolate(method='linear', axis=0, - limit_direction='forward') + interpolator = pivot.interpolate( + method='linear', axis=0, limit_direction='forward') # Melt back into stacked format interpolator['stop_sequence_merge'] = interpolator.index @@ -559,10 +545,10 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): lambda col: col.last_valid_index(), axis=0) last_valid_stop_df = last_valid_stop_series.to_frame('last_valid_stop') - df_for_interpolation = (df_for_interpolation - .merge(last_valid_stop_df, - left_on='unique_trip_id', - right_index=True)) + df_for_interpolation = ( + df_for_interpolation.merge( + last_valid_stop_df, left_on='unique_trip_id', + right_index=True)) trailing = (df_for_interpolation.stop_sequence > df_for_interpolation.last_valid_stop) @@ -571,12 +557,16 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): df_for_interpolation['stop_sequence_merge'] = ( df_for_interpolation[~trailing]['stop_sequence']) - # Need to check if existing index in column names and drop if so (else - # a ValueError where Pandas can't insert + # Need to check if existing index is in column names and drop if + # so (else a ValueError where Pandas can't insert # b/c col already exists will occur) drop_bool = False if _check_if_index_name_in_cols(df_for_interpolation): - # move the current index to own col named 'index' + # move the current index to its own col named 'index' + log('stop_times index name: {} is also a column name. 
' + 'Index will be dropped for interpolation and re-created ' + 'afterwards to continue.'.format( + df_for_interpolation.index.name)) col_name_to_copy = df_for_interpolation.index.name col_to_copy = df_for_interpolation[col_name_to_copy].copy() df_for_interpolation['index'] = col_to_copy @@ -584,17 +574,20 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): df_for_interpolation.reset_index(inplace=True, drop=drop_bool) # Merge back into original index - interpolated_df = pd.merge(df_for_interpolation, melted, 'left', - on=['stop_sequence_merge', - 'unique_trip_id']) - interpolated_df.set_index('index', inplace=True) + interpolated_df = pd.merge( + df_for_interpolation, melted, how='left', + on=['stop_sequence_merge', 'unique_trip_id']) + + # set index back to what it was if it was removed above before merge + if drop_bool is False: + interpolated_df.set_index('index', inplace=True) + interpolated_times = ( interpolated_df[['departure_time_sec_interpolate']]) - final_stop_times_df = pd.merge(stop_times_df, interpolated_times, - how='left', left_index=True, - right_index=True, sort=False, - copy=False) + final_stop_times_df = pd.merge( + stop_times_df, interpolated_times, how='left', + left_index=True, right_index=True, sort=False, copy=False) else: final_stop_times_df = stop_times_df @@ -607,12 +600,14 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): final_stop_times_df['departure_time_sec_interpolate'].fillna( final_stop_times_df['departure_time_sec'], inplace=True) - # TODO: refine this count so it refers to only the data that matters num_not_interpolated = final_stop_times_df[ 'departure_time_sec_interpolate'].isnull().sum() if num_not_interpolated > 0: - log('WARNING: Number of records unable to interpolate: {:,}. ' - 'These records have been removed.'.format(num_not_interpolated), + log('WARNING: Number of stop_time records unable to interpolate: {:,}.' + ' These records likely had stops in either the start or ' + 'end sequence that did not have time information avaiable to ' + 'interpolate between. These records have been removed.'.format( + num_not_interpolated), level=lg.WARNING) # convert the interpolated times (float) to integer so all times are @@ -625,16 +620,14 @@ def _interpolate_stop_times(stop_times_df, calendar_selected_trips_df): final_stop_times_df['departure_time_sec_interpolate'] = \ final_stop_times_df['departure_time_sec_interpolate'].astype(int) - # add unique stop id + # add unique stop ID final_stop_times_df['unique_stop_id'] = ( final_stop_times_df['stop_id'].str.cat( final_stop_times_df['unique_agency_id'].astype('str'), sep='_')) if missing_stop_times_count > 0: - log( - 'Departure stop time interpolation complete. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Departure stop time interpolation complete. ' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return final_stop_times_df @@ -656,13 +649,11 @@ def _time_difference(stop_times_df): """ start_time = time.time() - # calculate difference between consecutive records grouping by trip id. + # calculate difference between consecutive records grouping by trip ID stop_times_df['timediff'] = stop_times_df.groupby('unique_trip_id')[ 'departure_time_sec_interpolate'].diff() - log( - 'Difference between stop times has been successfully calculated. ' - 'Took {:,.2f} seconds'.format( - time.time() - start_time)) + log('Difference between stop times has been successfully calculated. 
' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return stop_times_df @@ -706,15 +697,16 @@ def _time_selector(df, starttime, endtime): # create df of stops times that are within the requested range selected_stop_timesdf = df[( - (starttime_sec < df["departure_time_sec_interpolate"]) & ( - df["departure_time_sec_interpolate"] < endtime_sec))] - - log( - 'Stop times from {} to {} successfully selected {:,} records out of ' - '{:,} total records ({:.2f} percent of total). Took {:,' - '.2f} seconds'.format( - starttime, endtime, len(selected_stop_timesdf), len(df), - (len(selected_stop_timesdf) / len(df)) * 100, + (starttime_sec < df["departure_time_sec_interpolate"]) & ( + df["departure_time_sec_interpolate"] < endtime_sec))] + + subset_df_count = len(selected_stop_timesdf) + df_count = len(df) + log('Stop times from {} to {} successfully selected {:,} records out of ' + '{:,} total records ({:.2f} percent of total). ' + 'Took {:,.2f} seconds.'.format( + starttime, endtime, subset_df_count, df_count, + (subset_df_count / df_count) * 100, time.time() - start_time)) return selected_stop_timesdf @@ -741,7 +733,7 @@ def _format_transit_net_edge(stop_times_df): log('Starting transformation process for {:,} ' 'total trips...'.format(len(stop_times_df['unique_trip_id'].unique()))) - # set columns for new df for data needed by pandana for edges + # set columns for new df for data needed by Pandana for edges merged_edge = [] stop_times_df.sort_values(by=['unique_trip_id', 'stop_sequence'], @@ -752,13 +744,13 @@ def _format_transit_net_edge(stop_times_df): "node_id_from": tmp_trip_df['unique_stop_id'].iloc[:-1].values, "node_id_to": tmp_trip_df['unique_stop_id'].iloc[1:].values, "weight": tmp_trip_df['timediff'].iloc[1:].values, - "unique_agency_id": tmp_trip_df['unique_agency_id'].iloc[ - 1:].values, - # set unique trip id without edge order to join other data later + "unique_agency_id": tmp_trip_df[ + 'unique_agency_id'].iloc[1:].values, + # set unique trip ID without edge order to join other data later "unique_trip_id": trip }) - # Set current trip id to edge id column adding edge order at + # Set current trip ID to edge ID column adding edge order at # end of string edge_df['sequence'] = (edge_df.index + 1).astype(int) @@ -766,15 +758,14 @@ def _format_transit_net_edge(stop_times_df): merged_edge.append(edge_df) merged_edge_df = pd.concat(merged_edge, ignore_index=True) - merged_edge_df['sequence'] = merged_edge_df['sequence'].astype(int, - copy=False) + merged_edge_df['sequence'] = merged_edge_df['sequence'].astype( + int, copy=False) merged_edge_df['id'] = ( merged_edge_df['unique_trip_id'].str.cat( merged_edge_df['sequence'].astype('str'), sep='_')) - log('stop time table transformation to ' - 'Pandana format edge table completed. ' - 'Took {:,.2f} seconds'.format(time.time() - start_time)) + log('Stop time table transformation to Pandana format edge table ' + 'completed. 
Took {:,.2f} seconds.'.format(time.time() - start_time)) return merged_edge_df @@ -799,7 +790,7 @@ def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'): """ valid_convert_to = ['seconds', 'minutes'] if convert_to not in valid_convert_to or not isinstance(convert_to, str): - raise ValueError('{} not a valid value or not a string'.format( + raise ValueError('{} is not a valid value or is not a string.'.format( convert_to)) if convert_to == 'seconds': @@ -815,8 +806,7 @@ def _convert_imp_time_units(df, time_col='weight', convert_to='minutes'): return df -def _stops_in_edge_table_selector(input_stops_df, - input_stop_times_df): +def _stops_in_edge_table_selector(input_stops_df, input_stop_times_df): """ Select stops that are active during the day and time period specified @@ -834,22 +824,21 @@ def _stops_in_edge_table_selector(input_stops_df, """ start_time = time.time() - # add unique stop id + # add unique stop ID input_stops_df['unique_stop_id'] = ( input_stops_df['stop_id'].str.cat( input_stops_df['unique_agency_id'].astype('str'), sep='_')) - # Select stop ids that match stop ids in the subset stop time data that + # Select stop IDs that match stop IDs in the subset stop time data that # match day and time selection selected_stops_df = input_stops_df.loc[ input_stops_df['unique_stop_id'].isin( input_stop_times_df['unique_stop_id'])] - log( - '{:,} of {:,} records selected from stops. Took {:,' - '.2f} seconds'.format( - len(selected_stops_df), len(input_stops_df), - time.time() - start_time)) + log('{:,} of {:,} records selected from stops. ' + 'Took {:,.2f} seconds.'.format( + len(selected_stops_df), len(input_stops_df), + time.time() - start_time)) return selected_stops_df @@ -870,7 +859,7 @@ def _format_transit_net_nodes(df): """ start_time = time.time() - # add unique stop id + # add unique stop ID if 'unique_stop_id' not in df.columns: df['unique_stop_id'] = ( df['stop_id'].str.cat( @@ -891,13 +880,11 @@ def _format_transit_net_nodes(df): col_list.append(item) final_node_df = pd.concat([final_node_df, df[col_list]], axis=1) - # set node index to be unique stop id + # set node index to be unique stop ID final_node_df = final_node_df.set_index('node_id') - log( - 'stop time table transformation to Pandana format node table ' - 'completed. Took {:,.2f} seconds'.format( - time.time() - start_time)) + log('Stop time table transformation to Pandana format node table ' + 'completed. 
Took {:,.2f} seconds.'.format(time.time() - start_time)) return final_node_df @@ -920,37 +907,33 @@ def _route_type_to_edge(transit_edge_df, stop_time_df): """ start_time = time.time() - # create unique trip ids + # create unique trip IDs stop_time_df['unique_trip_id'] = ( stop_time_df['trip_id'].str.cat( stop_time_df['unique_agency_id'].astype('str'), sep='_')) # join route_id to the edge table - merged_df = pd.merge(transit_edge_df, - stop_time_df[['unique_trip_id', 'route_type']], - how='left', on='unique_trip_id', sort=False, - copy=False) - merged_df.drop_duplicates(subset='unique_trip_id', - keep='first', - inplace=True) + merged_df = pd.merge( + transit_edge_df, stop_time_df[['unique_trip_id', 'route_type']], + how='left', on='unique_trip_id', sort=False, copy=False) + merged_df.drop_duplicates( + subset='unique_trip_id', keep='first', inplace=True) # need to get unique records here to have a one to one join - # this serves as the look up table # join the look up table created above to the table of interest - transit_edge_df_w_routetype = pd.merge(transit_edge_df, merged_df[ - ['route_type', 'unique_trip_id']], how='left', on='unique_trip_id', - sort=False, copy=False) + transit_edge_df_w_routetype = pd.merge( + transit_edge_df, merged_df[['route_type', 'unique_trip_id']], + how='left', on='unique_trip_id', sort=False, copy=False) - log( - 'route type successfully joined to transit edges. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Route type successfully joined to transit edges. ' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return transit_edge_df_w_routetype def _route_id_to_edge(transit_edge_df, trips_df): """ - Append route ids to transit edge table + Append route IDs to transit edge table Parameters ---------- @@ -967,7 +950,7 @@ def _route_id_to_edge(transit_edge_df, trips_df): start_time = time.time() if 'unique_route_id' not in transit_edge_df.columns: - # create unique trip and route ids + # create unique trip and route IDs trips_df['unique_trip_id'] = ( trips_df['trip_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) @@ -975,38 +958,44 @@ def _route_id_to_edge(transit_edge_df, trips_df): trips_df['route_id'].str.cat( trips_df['unique_agency_id'].astype('str'), sep='_')) - transit_edge_df_with_routes = pd.merge(transit_edge_df, trips_df[ - ['unique_trip_id', 'unique_route_id']], - how='left', - on='unique_trip_id', sort=False, - copy=False) + transit_edge_df_with_routes = pd.merge( + transit_edge_df, trips_df[['unique_trip_id', 'unique_route_id']], + how='left', on='unique_trip_id', sort=False, copy=False) - log( - 'route id successfully joined to transit edges. Took {:,' - '.2f} seconds'.format( - time.time() - start_time)) + log('Route ID successfully joined to transit edges. 
' + 'Took {:,.2f} seconds.'.format(time.time() - start_time)) return transit_edge_df_with_routes -def edge_impedance_by_route_type(transit_edge_df, - street_level_rail=None, - underground_rail=None, - intercity_rail=None, - bus=None, - ferry=None, - cable_car=None, - gondola=None, - funicular=None): +def edge_impedance_by_route_type( + transit_edge_df, + travel_time_col_name='weight', + street_level_rail=None, + underground_rail=None, + intercity_rail=None, + bus=None, + ferry=None, + cable_car=None, + gondola=None, + funicular=None, + trolleybus=None, + monorail=None +): """ Penalize transit edge travel time based on transit mode type Parameters ---------- transit_edge_df : pandas.DataFrame - transit edge dataframe + transit edge DataFrame + travel_time_col_name : str, optional + name of travel time column to apply multiplier factor, + default column name is 'weight' street_level_rail : float, optional factor between -1 to 1 to multiply against travel time + underground_rail : float, optional + factor between -1 to 1 to multiply against travel time intercity_rail : float, optional factor between -1 to 1 to multiply against travel time bus : float, optional @@ -1019,225 +1008,221 @@ def edge_impedance_by_route_type(transit_edge_df, factor between -1 to 1 to multiply against travel time funicular : float, optional factor between -1 to 1 to multiply against travel time + trolleybus : float, optional + factor between -1 to 1 to multiply against travel time + monorail : float, optional + factor between -1 to 1 to multiply against travel time Returns ------- - ua_network : object - ua_network.transit_edges : pandas.DataFrame - + transit_edge_df : pandas.DataFrame + Returns transit_edge_df with travel_time_col_name column weighted by + specified coefficients by route type """ - if 'route_type' not in transit_edge_df.columns: - raise ValueError('No route_type column was found in dataframe') + req_cols = [travel_time_col_name, 'route_type'] + if not isinstance(travel_time_col_name, str): + raise ValueError('travel_time_col_name must be a string.') + for col in req_cols: + if col in transit_edge_df.columns: + if not pd.api.types.is_numeric_dtype(transit_edge_df[col]): + raise ValueError('{} must be a number.'.format(col)) + else: + raise ValueError('Column: {} was not found in transit_edge_df ' + 'DataFrame and is required.'.format(col)) # check count of records for each route type - route_type_desc = {0: 'Street Level Rail: Tram Streetcar Light rail', - 1: 'Underground rail: Subway or Metro', - 2: 'Rail: intercity or long-distance ', 3: 'Bus', - 4: 'Ferry', 5: 'Cable Car', - 6: 'Gondola or Suspended cable car', - 7: 'Steep incline: Funicular'} - log('Route type distribution as percentage of transit mode: {:.2f}'.format( - transit_edge_df['route_type'].map(route_type_desc.get).value_counts( - normalize=True, dropna=False) * 100)) - - var_list = [street_level_rail, underground_rail, intercity_rail, bus, - ferry, cable_car, gondola, funicular] - - for var in var_list: - if var is not None: - if not isinstance(var, float): - raise ValueError('One or more variables are not float') - - travel_time_col_name = 'weight' + # route types taken from 'route_type' definition on route.txt GTFS file: + # https://developers.google.com/transit/gtfs/reference#routestxt + route_type_dict = { + 0: {'name': 'Street Level Rail: Tram, Streetcar, or Light rail', + 'multiplier': street_level_rail}, + 1: {'name': 'Underground rail: Subway or Metro', + 'multiplier': underground_rail}, + 2: {'name': 'Rail: intercity or 
long-distance ', + 'multiplier': intercity_rail}, + 3: {'name': 'Bus', + 'multiplier': bus}, + 4: {'name': 'Ferry', + 'multiplier': ferry}, + 5: {'name': 'Cable tram or car', + 'multiplier': cable_car}, + 6: {'name': 'Aerial lift: Gondola or Suspended cable car', + 'multiplier': gondola}, + 7: {'name': 'Steep incline: Funicular', + 'multiplier': funicular}, + 11: {'name': 'Trolleybus', + 'multiplier': trolleybus}, + 12: {'name': 'Monorail', + 'multiplier': monorail}} + # create the dict to pass to value_counts() + route_type_desc = route_type_dict.copy() + for key, val in route_type_dict.items(): + route_type_desc[key] = val['name'] + + log('Route type distribution as percentage of transit mode:') + summary_stat = transit_edge_df['route_type'].map( + route_type_desc.get).value_counts(normalize=True, dropna=False) * 100 + log(summary_stat) + travel_time_col = transit_edge_df[travel_time_col_name] - if street_level_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 0]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 0] = travel_time_col + ( - travel_time_col * street_level_rail) - log( - 'Adjusted Street Level Rail transit edge impedance based on mode' - ' type penalty coefficient: {}'.format( - street_level_rail)) - if underground_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 1]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 1] = travel_time_col + ( - travel_time_col * underground_rail) - log( - 'Adjusted Underground rail transit edge impedance based on mode ' - 'type penalty coefficient: {}'.format( - underground_rail)) - if intercity_rail is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 2]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 2] = travel_time_col + ( - travel_time_col * intercity_rail) - log( - 'Adjusted Rail transit edge impedance based on mode type penalty ' - 'coefficient: {}'.format( - intercity_rail)) - if bus is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 3]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 3] = travel_time_col + ( - travel_time_col * bus) - log( - 'Adjusted Bus transit edge impedance based on mode type penalty ' - 'coefficient: {}'.format( - bus)) - if ferry is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 4]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 4] = travel_time_col + ( - travel_time_col * ferry) - log( - 'Adjusted Ferry transit edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - ferry)) - if cable_car is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 5]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 5] = travel_time_col + ( - travel_time_col * cable_car) - log( - 'Adjusted Cable Car transit edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - cable_car)) - if gondola is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 6]) > 0: - transit_edge_df[travel_time_col_name][ - transit_edge_df['route_type'] == 6] = travel_time_col + ( - travel_time_col * gondola) - log( - 'Adjusted Gondola or Suspended cable car transit edge impedance ' - 'based on mode type penalty coefficient: {}'.format( - gondola)) - if funicular is not None and len( - transit_edge_df[transit_edge_df['route_type'] == 7]) > 0: - transit_edge_df[travel_time_col_name][ - 
transit_edge_df['route_type'] == 7] = travel_time_col + ( - travel_time_col * funicular) - log( - 'Adjusted Funicular transit edge impedance based on mode type ' - 'penalty coefficient: {}'.format( - funicular)) - - ua_network.transit_edges = transit_edge_df - - log('Transit edge impedance mode type penalty calculation complete') + for route_type, route_vals in route_type_dict.items(): + if route_vals['multiplier'] is not None: + if not isinstance(route_vals['multiplier'], float): + raise ValueError('One or more multiplier variables are not ' + 'float.') - return ua_network + # warn if multiplier is not within optimal range + if not -1 <= route_vals['multiplier'] <= 1: + log('WARNING: Multiplier value of: {} should be a ' + 'value between -1 and 1.'.format(route_vals['multiplier']), + level=lg.WARNING) + route_type_cnt = len( + transit_edge_df[transit_edge_df['route_type'] == route_type]) + + # warn if route type is not found in DataFrame + if route_type_cnt == 0 and route_vals['multiplier'] is not None: + log('WARNING: Route type: {} with specified multiplier value ' + 'of: {} was not found in the specified edge ' + 'DataFrame.'.format( + route_vals['name'], route_vals['multiplier']), + level=lg.WARNING) + + if route_type_cnt > 0: + transit_edge_df[travel_time_col_name][ + transit_edge_df['route_type'] == route_type] = \ + travel_time_col + ( + travel_time_col * route_vals['multiplier']) + log('Adjusted {} transit edge impedance based on mode ' + 'type penalty coefficient: {}.'.format( + route_vals['name'], route_vals['multiplier'])) + + log('Transit edge impedance mode type penalty calculation complete.') + return transit_edge_df -def save_processed_gtfs_data(gtfsfeeds_dfs, - filename, - dir=config.settings.data_folder): +def save_processed_gtfs_data( + gtfsfeeds_dfs, filename, dir=config.settings.data_folder): """ - Write dataframes in a gtfsfeeds_dfs object to a hdf5 file + Write dataframes in an urbanaccess_gtfs_df object to a HDF5 file Parameters ---------- gtfsfeeds_dfs : object - gtfsfeeds_dfs object + urbanaccess_gtfs_df object filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension dir : string, optional - directory to save hdf5 file + directory to save HDF5 file Returns ------- None """ - # TODO: refactor check below to use any() for readability - if gtfsfeeds_dfs is None or gtfsfeeds_dfs.stops.empty or \ - gtfsfeeds_dfs.routes.empty or gtfsfeeds_dfs.trips.empty \ - or gtfsfeeds_dfs.stop_times.empty or \ - gtfsfeeds_dfs.calendar.empty or \ - gtfsfeeds_dfs.stop_times_int.empty: - raise ValueError('gtfsfeeds_dfs is missing one of the required ' - 'dataframes.') - - df_to_hdf5(data=gtfsfeeds_dfs.stops, key='stops', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.routes, key='routes', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.trips, key='trips', overwrite_key=False, - dir=dir, filename=filename, overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.stop_times, key='stop_times', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.calendar, key='calendar', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - df_to_hdf5(data=gtfsfeeds_dfs.stop_times_int, key='stop_times_int', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) - - if gtfsfeeds_dfs.headways.empty is False: - 
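
A minimal usage sketch for the refactored `edge_impedance_by_route_type` shown above, assuming `transit_net` is the object returned by an earlier `create_transit_net` call and that the function is imported from `urbanaccess.gtfs.network`; multipliers must be floats, and the adjustment is `weight * (1 + multiplier)`, so a 5.0-minute bus edge with `bus=0.5` becomes 7.5 minutes and a 5.0-minute subway edge with `underground_rail=-0.2` becomes 4.0 minutes:

```python
from urbanaccess.gtfs.network import edge_impedance_by_route_type

# 'transit_net' is assumed to come from an earlier create_transit_net() call.
adjusted_edges = edge_impedance_by_route_type(
    transit_net.transit_edges,
    travel_time_col_name='weight',
    bus=0.5,                # bus edges: weight * 1.5
    underground_rail=-0.2)  # subway/metro edges: weight * 0.8
```

Note that, per the refactor, the adjusted DataFrame is returned rather than written back onto `ua_network`, so the caller keeps the result explicitly.
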
df_to_hdf5(data=gtfsfeeds_dfs.headways, key='headways', - overwrite_key=False, dir=dir, filename=filename, - overwrite_hdf5=False) + log('Writing HDF5 store...') + if not isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df): + raise ValueError('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + + req_df_dict = {'stops': gtfsfeeds_dfs.stops, + 'routes': gtfsfeeds_dfs.routes, + 'trips': gtfsfeeds_dfs.trips, + 'stop_times': gtfsfeeds_dfs.stop_times, + 'stop_times_int': gtfsfeeds_dfs.stop_times_int} + # calendar or calendar_dates are required but not both + optional_df_dict = {'headways': gtfsfeeds_dfs.headways, + 'calendar': gtfsfeeds_dfs.calendar, + 'calendar_dates': gtfsfeeds_dfs.calendar_dates} + + for name, gtfs_df in req_df_dict.items(): + if gtfs_df.empty: + raise ValueError('gtfsfeeds_dfs is missing required ' + 'DataFrame: {}.'.format(name)) + if gtfsfeeds_dfs.calendar.empty and gtfsfeeds_dfs.calendar_dates.empty: + raise ValueError('gtfsfeeds_dfs is missing either the calendar or ' + 'calendar_dates DataFrame.') - if gtfsfeeds_dfs.calendar_dates.empty is False: - df_to_hdf5(data=gtfsfeeds_dfs.calendar_dates, key='calendar_dates', + tables_saved = [] + for name, gtfs_df in req_df_dict.items(): + df_to_hdf5(data=gtfs_df, key=name, overwrite_key=False, dir=dir, filename=filename, overwrite_hdf5=False) + tables_saved.extend([name]) + + for name, gtfs_df in optional_df_dict.items(): + if gtfs_df.empty is False: + df_to_hdf5(data=gtfs_df, key=name, + overwrite_key=False, dir=dir, filename=filename, + overwrite_hdf5=False) + tables_saved.extend([name]) + + log('Saved HDF5 store: {} with tables: {}.'.format( + os.path.join(dir, filename), tables_saved)) def load_processed_gtfs_data(filename, dir=config.settings.data_folder): """ - Read data from a hdf5 file to a gtfsfeeds_dfs object + Read data from a HDF5 file to an urbanaccess_gtfs_df object Parameters ---------- filename : string - name of the hdf5 file to read with .h5 extension + name of the HDF5 file to read with .h5 extension dir : string, optional - directory to read hdf5 file + directory to read HDF5 file Returns ------- gtfsfeeds_dfs : object + urbanaccess_gtfs_df object """ - gtfsfeeds_dfs.stops = hdf5_to_df(dir=dir, filename=filename, key='stops') - gtfsfeeds_dfs.routes = hdf5_to_df(dir=dir, filename=filename, key='routes') - gtfsfeeds_dfs.trips = hdf5_to_df(dir=dir, filename=filename, key='trips') - gtfsfeeds_dfs.stop_times = hdf5_to_df(dir=dir, filename=filename, - key='stop_times') - gtfsfeeds_dfs.calendar = hdf5_to_df(dir=dir, filename=filename, - key='calendar') - gtfsfeeds_dfs.stop_times_int = hdf5_to_df(dir=dir, filename=filename, - key='stop_times_int') - - hdf5_load_path = '{}/{}'.format(dir, filename) + log('Loading HDF5 store...') + req_df_dict = {'stops': gtfsfeeds_dfs.stops, + 'routes': gtfsfeeds_dfs.routes, + 'trips': gtfsfeeds_dfs.trips, + 'stop_times': gtfsfeeds_dfs.stop_times, + 'stop_times_int': gtfsfeeds_dfs.stop_times_int} + # calendar or calendar_dates are required but not both + optional_df_dict = {'headways': gtfsfeeds_dfs.headways, + 'calendar': gtfsfeeds_dfs.calendar, + 'calendar_dates': gtfsfeeds_dfs.calendar_dates} + + tables_read = [] + for name, gtfs_df in req_df_dict.items(): + vars(gtfsfeeds_dfs)[name] = hdf5_to_df( + dir=dir, filename=filename, key=name) + tables_read.extend([name]) + + # open HDF5 to read keys + hdf5_load_path = os.path.join(dir, filename) with pd.HDFStore(hdf5_load_path) as store: - - if 'headways' in store.keys(): - gtfsfeeds_dfs.headways = hdf5_to_df(dir=dir, - 
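
A hedged round-trip sketch for the `save_processed_gtfs_data` / `load_processed_gtfs_data` pair being reworked in this hunk, assuming `loaded_feeds` is an `urbanaccess_gtfs_df` whose `stop_times_int` table has already been populated (for example by `create_transit_net`), and that `'processed_gtfs.h5'` and the `'data'` directory are placeholder names:

```python
from urbanaccess.gtfs.network import (load_processed_gtfs_data,
                                      save_processed_gtfs_data)

# Writes stops, routes, trips, stop_times, stop_times_int, plus whichever of
# calendar / calendar_dates / headways are non-empty, to a single HDF5 store.
save_processed_gtfs_data(loaded_feeds, filename='processed_gtfs.h5', dir='data')

# Reads the same keys back into an urbanaccess_gtfs_df object.
feeds_from_disk = load_processed_gtfs_data(filename='processed_gtfs.h5', dir='data')
```
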
filename=filename, - key='headways') - if 'calendar_dates' in store.keys(): - gtfsfeeds_dfs.calendar_dates = hdf5_to_df(dir=dir, - filename=filename, - key='calendar_dates') + hdf5_keys = store.keys() + hdf5_keys = [item.replace('/', '') for item in hdf5_keys] + for name, gtfs_df in optional_df_dict.items(): + # if optional key exists, read it + if name in hdf5_keys: + vars(gtfsfeeds_dfs)[name] = hdf5_to_df( + dir=dir, filename=filename, key=name) + tables_read.extend([name]) + log('Read HDF5 store: {} tables: {}.'.format( + hdf5_load_path, tables_read)) return gtfsfeeds_dfs def _check_if_index_name_in_cols(df): """ - Check if existing index is in the passed dataframe list of column names + Check if specified Dataframe has an index name that is also a column name Parameters ---------- df : pandas.DataFrame - interpolated stop_time dataframe + Dataframe to check index and columns Returns ------- - iname : tuple + iname : boolean + True if index name is also a column name, else False """ cols = df.columns.values iname = df.index.name diff --git a/urbanaccess/gtfs/utils_format.py b/urbanaccess/gtfs/utils_format.py index d285d38..638a001 100644 --- a/urbanaccess/gtfs/utils_format.py +++ b/urbanaccess/gtfs/utils_format.py @@ -125,7 +125,7 @@ def _read_gtfs_trips(textfile_path, textfile): 'service_id': object, 'route_id': object, 7: object}, low_memory=False) - # 7 is placeholder for shape id which may not exist in some txt files + # 7 is placeholder for shape ID which may not exist in some txt files if len(df) == 0: raise ValueError('{} has no records'.format(os.path.join( textfile_path, textfile))) @@ -249,7 +249,7 @@ def _read_gtfs_calendar_dates(textfile_path, textfile): def _calendar_dates_agencyid(calendar_dates_df, routes_df, trips_df, agency_df, feed_folder): """ - Assign unique agency id to calendar dates dataframe + Assign unique agency ID to calendar dates dataframe Parameters ---------- @@ -312,7 +312,7 @@ def _calendar_dates_agencyid(calendar_dates_df, routes_df, def _calendar_agencyid(calendar_df, routes_df, trips_df, agency_df, feed_folder): """ - Assign unique agency id to calendar dataframe + Assign unique agency ID to calendar dataframe Parameters ---------- @@ -335,7 +335,7 @@ def _calendar_agencyid(calendar_df, routes_df, trips_df, sort=False, copy=False) tmp2 = pd.merge(trips_df, tmp1, how='left', on='route_id', sort=False, copy=False) - # do another merge to account for service ids that may not be utilized + # do another merge to account for service IDs that may not be utilized # across all GTFS files for accounting purposes so we keep those that # dont show up after merge merged_df = pd.merge(calendar_df[['service_id']], tmp2, how='left', @@ -377,7 +377,7 @@ def _calendar_agencyid(calendar_df, routes_df, trips_df, def _trips_agencyid(trips_df, routes_df, agency_df): """ - Assign unique agency id to trips dataframe + Assign unique agency ID to trips dataframe Parameters ---------- @@ -409,7 +409,7 @@ def _trips_agencyid(trips_df, routes_df, agency_df): def _stops_agencyid(stops_df, trips_df, routes_df, stop_times_df, agency_df, feed_folder): """ - Assign unique agency id to stops dataframe + Assign unique agency ID to stops dataframe Parameters ---------- @@ -475,7 +475,7 @@ def _stops_agencyid(stops_df, trips_df, routes_df, def _routes_agencyid(routes_df, agency_df): """ - Assign unique agency id to routes dataframe + Assign unique agency ID to routes dataframe Parameters ---------- @@ -503,7 +503,7 @@ def _routes_agencyid(routes_df, agency_df): def 
_stop_times_agencyid(stop_times_df, routes_df, trips_df, agency_df): """ - Assign unique agency id to stop times dataframe + Assign unique agency ID to stop times dataframe Parameters ---------- @@ -541,7 +541,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, trips_df, stop_times_df, calendar_df, feed_folder, calendar_dates_df, nulls_as_folder=True): """ - Create a unique agency id for all gtfs feed dataframes to enable unique + Create an unique agency ID for all gtfs feed dataframes to enable unique relational table keys Parameters @@ -563,8 +563,8 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, calendar_dates_df : pandas:DataFrame calendar dates dataframe nulls_as_folder : bool, optional - if true, gtfs feeds where the agency id is null, the gtfs folder - name will be used as the unique agency id + if true, gtfs feeds where the agency ID is null, the gtfs folder + name will be used as the unique agency ID Returns ------- stops_df, routes_df, trips_df, stop_times_df, calendar_df, @@ -589,7 +589,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, df_list[index] = df log('The agency.txt or agency_id column was not found. The unique ' - 'agency id: {} was generated using the name of the folder ' + 'agency ID: {} was generated using the name of the folder ' 'containing the GTFS feed text files.'.format( unique_agency_id)) @@ -623,7 +623,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, df['unique_agency_id'] = unique_agency_id df_list[index] = df log( - 'The unique agency id: {} was generated using the name of ' + 'The unique agency ID: {} was generated using the name of ' 'the agency in the agency.txt file.'.format( unique_agency_id)) @@ -693,7 +693,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, log( 'agency.txt agency_name column has more than one agency name ' - 'listed. Unique agency id was assigned using the agency id ' + 'listed. Unique agency ID was assigned using the agency ID ' 'and associated agency name.') for index, df in enumerate(df_list): @@ -706,7 +706,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, inplace=True) log( 'There are {:,} null values ({:,.2f}% of {:,} total) without ' - 'a unique agency id. These records will be labeled as ' + 'a unique agency ID. These records will be labeled as ' 'multiple_operators_ with the GTFS file folder ' 'name'.format(df['unique_agency_id'].isnull().sum(), len(df), @@ -718,7 +718,7 @@ def _add_unique_agencyid(agency_df, stops_df, routes_df, if calendar_dates_df.empty: df_list.extend([calendar_dates_df]) - log('Unique agency id operation complete. Took {:,.2f} seconds'.format( + log('Unique agency ID operation complete. Took {:,.2f} seconds'.format( time.time() - start_time)) return df_list @@ -727,7 +727,7 @@ def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df, stop_times_df, calendar_df, calendar_dates_df, feed_folder, feed_number): """ - Create a unique GTFS feed specific id for all gtfs feed dataframes to + Create an unique GTFS feed specific ID for all gtfs feed dataframes to enable tracking of specific feeds Parameters @@ -776,7 +776,7 @@ def _add_unique_gtfsfeed_id(stops_df, routes_df, trips_df, if calendar_dates_df.empty: df_list.extend([calendar_dates_df]) - log('Unique GTFS feed id operation complete. Took {:,.2f} seconds'.format( + log('Unique GTFS feed ID operation complete. 
Took {:,.2f} seconds'.format( time.time() - start_time)) return df_list @@ -1069,7 +1069,7 @@ def _append_route_type(stops_df, stop_times_df, routes_df, def _generate_unique_agency_id(df, col_name): """ - Generate unique agency id + Generate unique agency ID Parameters ---------- @@ -1092,7 +1092,7 @@ def _generate_unique_agency_id(df, col_name): def _generate_unique_feed_id(feed_folder): """ - Generate unique feed id + Generate unique feed ID Parameters ---------- diff --git a/urbanaccess/gtfs/utils_validation.py b/urbanaccess/gtfs/utils_validation.py index b31c4f1..2d4bf2e 100644 --- a/urbanaccess/gtfs/utils_validation.py +++ b/urbanaccess/gtfs/utils_validation.py @@ -175,3 +175,40 @@ def _validate_gtfs(stops_df, feed_folder, _checkcoordinates(df=stops_df, feed_folder=feed_folder) return stops_df + + +def _check_time_range_format(timerange): + """ + Check time range value format for expected schema + + Parameters + ---------- + timerange : list + time range as a list with time 1 and time 2 as strings. + Must follow format of a 24 hour clock for example: + 08:00:00 or 17:00:00 + + Returns + ------- + None + """ + time_error_statement = ( + '{} starttime and endtime are not in the correct format. ' + 'Format should be a 24 hour clock in the following format: 08:00:00 ' + 'or 17:00:00.'.format(timerange)) + if not isinstance(timerange, list) or len(timerange) != 2: + raise ValueError(time_error_statement) + if timerange[0] > timerange[1]: + raise ValueError(time_error_statement) + for t in timerange: + if not isinstance(t, str): + raise ValueError(time_error_statement) + if len(t) != 8: + raise ValueError(time_error_statement) + timerange_hr_1 = int(str(timerange[0][0:2])) + timerange_hr_2 = int(str(timerange[1][0:2])) + if timerange_hr_2 - timerange_hr_1 > 3: + log('WARNING: Time range passed: {} is a {} hour period. Long ' + 'periods over 3 hours may take a significant amount of time to ' + 'process.'.format(timerange, timerange_hr_2 - timerange_hr_1), + level=lg.WARNING) diff --git a/urbanaccess/gtfsfeeds.py b/urbanaccess/gtfsfeeds.py index 26acb02..53a047d 100644 --- a/urbanaccess/gtfsfeeds.py +++ b/urbanaccess/gtfsfeeds.py @@ -39,7 +39,7 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder, 'gtfsfeeds'), yamlname='gtfsfeeds.yaml'): """ - Create a urbanaccess_gtfsfeeds instance from a saved YAML. + Create an urbanaccess_gtfsfeeds instance from a saved YAML. Parameters ---------- @@ -63,7 +63,7 @@ def from_yaml(cls, gtfsfeeddir=os.path.join(config.settings.data_folder, yaml_file = os.path.join(gtfsfeeddir, yamlname) with open(yaml_file, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) if not isinstance(yaml_config, dict): raise ValueError('{} yamlname is not a dict'.format(yamlname)) @@ -206,7 +206,7 @@ def to_yaml(self, gtfsfeeddir=os.path.join(config.settings.data_folder, yamlname='gtfsfeeds.yaml', overwrite=False): """ - Save a urbanaccess_gtfsfeeds representation to a YAML file. + Save an urbanaccess_gtfsfeeds representation to a YAML file. 
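
To make the expected `timerange` schema concrete, here is a short illustration of what the new `_check_time_range_format` helper accepts and rejects, based on the checks added above (it is a private helper, so calling it directly is for illustration only):

```python
from urbanaccess.gtfs.utils_validation import _check_time_range_format

# Valid: a two-element list of zero-padded 24-hour clock strings, start before end.
_check_time_range_format(['07:00:00', '10:00:00'])

# Valid, but logs a warning: the window spans more than 3 hours.
_check_time_range_format(['06:00:00', '10:00:00'])

# Each of these raises ValueError:
# _check_time_range_format(['7:00:00', '10:00:00'])   # hour not zero-padded to 8 chars
# _check_time_range_format(['10:00:00', '07:00:00'])  # start time is after end time
# _check_time_range_format('07:00:00')                 # not a two-element list
```
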
Parameters ---------- diff --git a/urbanaccess/network.py b/urbanaccess/network.py index 849afdc..1ec6f53 100644 --- a/urbanaccess/network.py +++ b/urbanaccess/network.py @@ -1,4 +1,5 @@ import time +import os import geopy from geopy import distance @@ -17,7 +18,7 @@ class urbanaccess_network(object): """ - A urbanaccess object of Pandas DataFrames representing + An urbanaccess object of Pandas DataFrames representing the components of a graph network Parameters @@ -300,7 +301,7 @@ def _route_id_to_node(stops_df, edges_w_routes): stops_df : pandas.DataFrame processed gtfs stops DataFrame edges_w_routes : pandas.DataFrame - transit edge DataFrame that has route id information + transit edge DataFrame that has route ID information Returns ------- @@ -309,7 +310,7 @@ def _route_id_to_node(stops_df, edges_w_routes): """ start_time = time.time() - # create unique stop ids + # create unique stop IDs stops_df['unique_stop_id'] = ( stops_df['stop_id'].str.cat( stops_df['unique_agency_id'].astype('str'), sep='_')) @@ -345,7 +346,7 @@ def _route_id_to_node(stops_df, edges_w_routes): transit_nodes_wroutes.drop_duplicates(subset='node_id_route', keep='first', inplace=True) - # set node index to be unique stop id + # set node index to be unique stop ID transit_nodes_wroutes = transit_nodes_wroutes.set_index('node_id_route') log( @@ -369,7 +370,7 @@ def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3): transit nodes DataFrame travel_speed_mph : int, optional travel speed to use to calculate travel time across a - distance on a edge. units are in miles per hour (MPH) + distance on an edge. units are in miles per hour (MPH) for pedestrian travel this is assumed to be 3 MPH Returns @@ -421,9 +422,8 @@ def _connector_edges(osm_nodes, transit_nodes, travel_speed_mph=3): def _format_pandana_edges_nodes(edge_df, node_df): """ Perform final formatting on nodes and edge DataFrames to prepare them - for use in Pandana. - Formatting mainly consists of creating a unique node id and edge from - and to id that is an integer + for use in Pandana. Formatting mainly consists of creating an unique + node ID and edge from and to ID that is an integer per Pandana requirements. Parameters @@ -440,7 +440,7 @@ def _format_pandana_edges_nodes(edge_df, node_df): """ start_time = time.time() - # pandana requires ids that are integer: for nodes - make it the index, + # Pandana requires IDs that are integer: for nodes - make it the index, # for edges make it the from and to columns node_df['id_int'] = range(1, len(node_df) + 1) @@ -460,7 +460,7 @@ def _format_pandana_edges_nodes(edge_df, node_df): try: edge_df_wnumericid[col] = edge_df_wnumericid[col].astype(str) # deal with edge cases where typically the name of a street is not - # in a uniform string encoding such as names with accents + # in an uniform string encoding such as names with accents except UnicodeEncodeError: log('Fixed unicode error in {} column'.format(col)) edge_df_wnumericid[col] = edge_df_wnumericid[col].str.encode( @@ -472,7 +472,7 @@ def _format_pandana_edges_nodes(edge_df, node_df): if 'nearest_osm_node' in node_df.columns: node_df.drop(['nearest_osm_node'], axis=1, inplace=True) - log('Edge and node tables formatted for Pandana with integer node ids: ' + log('Edge and node tables formatted for Pandana with integer node IDs: ' 'id_int, to_int, and from_int. 
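
The `_format_pandana_edges_nodes` changes above keep the requirement that Pandana receives integer node IDs; below is a self-contained pandas illustration of that mapping idea, using hypothetical data rather than the library's exact merge-based implementation:

```python
import pandas as pd

# Hypothetical mini network: string node IDs, edges referencing them.
nodes = pd.DataFrame({'id': ['1_agency_a', '2_agency_a', '3_agency_a'],
                      'x': [-122.26, -122.22, -122.27],
                      'y': [37.79, 37.77, 37.80]})
edges = pd.DataFrame({'from': ['1_agency_a', '2_agency_a'],
                      'to': ['2_agency_a', '3_agency_a'],
                      'weight': [5.0, 7.5]})

# Assign a sequential integer ID per node, then map it onto the edge
# endpoints, mirroring the id_int / from_int / to_int columns.
nodes['id_int'] = range(1, len(nodes) + 1)
id_map = dict(zip(nodes['id'], nodes['id_int']))
edges['from_int'] = edges['from'].map(id_map)
edges['to_int'] = edges['to'].map(id_map)
print(edges[['from_int', 'to_int', 'weight']])
```
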
Took {:,.2f} seconds'.format( time.time() - start_time)) return edge_df_wnumericid, node_df @@ -482,28 +482,29 @@ def save_network(urbanaccess_network, filename, dir=config.settings.data_folder, overwrite_key=False, overwrite_hdf5=False): """ - Write a urbanaccess_network integrated nodes and edges to a node and edge - table in a hdf5 file + Write urbanaccess_network integrated nodes and edges to a node and edge + table in a HDF5 file Parameters ---------- urbanaccess_network : object urbanaccess_network object with net_edges and net_nodes DataFrames filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension dir : string, optional - directory to save hdf5 file + directory to save HDF5 file overwrite_key : bool, optional if true any existing table with the specified key name will be overwritten overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns ------- None """ + log('Writing HDF5 store...') if urbanaccess_network is None or urbanaccess_network.net_edges.empty or \ urbanaccess_network.net_nodes.empty: raise ValueError('Either no urbanaccess_network specified or ' @@ -515,19 +516,21 @@ def save_network(urbanaccess_network, filename, df_to_hdf5(data=urbanaccess_network.net_nodes, key='nodes', overwrite_key=overwrite_key, dir=dir, filename=filename, overwrite_hdf5=overwrite_hdf5) + log("Saved HDF5 store: {} with tables: ['net_edges', 'net_nodes'].".format( + os.path.join(dir, filename))) def load_network(dir=config.settings.data_folder, filename=None): """ - Read an integrated network node and edge data from a hdf5 file to - a urbanaccess_network object + Read an integrated network node and edge data from a HDF5 file to + an urbanaccess_network object Parameters ---------- dir : string, optional - directory to read hdf5 file + directory to read HDF5 file filename : string - name of the hdf5 file to read with .h5 extension + name of the HDF5 file to read with .h5 extension Returns ------- @@ -536,7 +539,10 @@ def load_network(dir=config.settings.data_folder, filename=None): ua_network.net_edges : object ua_network.net_nodes : object """ + log('Loading HDF5 store...') ua_network.net_edges = hdf5_to_df(dir=dir, filename=filename, key='edges') ua_network.net_nodes = hdf5_to_df(dir=dir, filename=filename, key='nodes') + log("Read HDF5 store: {} tables: ['net_edges', 'net_nodes'].".format( + os.path.join(dir, filename))) return ua_network diff --git a/urbanaccess/osm/load.py b/urbanaccess/osm/load.py index 567b1d6..261b92c 100644 --- a/urbanaccess/osm/load.py +++ b/urbanaccess/osm/load.py @@ -82,7 +82,7 @@ def ua_network_from_bbox(lat_min=None, lng_min=None, lat_max=None, # remove low connectivity nodes and return cleaned nodes and edges if remove_lcn: - log('checking for low connectivity nodes...') + log('Checking for low connectivity nodes...') pandana_net = Network(nodes['x'], nodes['y'], edges['from'], edges['to'], edges[['distance']]) lcn = pandana_net.low_connectivity_nodes(impedance=10000, count=10, diff --git a/urbanaccess/osm/network.py b/urbanaccess/osm/network.py index e5f6fd3..3f3779a 100644 --- a/urbanaccess/osm/network.py +++ b/urbanaccess/osm/network.py @@ -18,7 +18,7 @@ def create_osm_net(osm_edges, osm_nodes, osm node dataframe travel_speed_mph : int, optional travel speed to use to calculate travel time across a - distance on a edge. 
units are in miles per hour (MPH) + distance on an edge. units are in miles per hour (MPH) for pedestrian travel this is assumed to be 3 MPH network_type : str, optional default is 'walk' for the osm pedestrian network. diff --git a/urbanaccess/plot.py b/urbanaccess/plot.py index ab24594..7288a40 100644 --- a/urbanaccess/plot.py +++ b/urbanaccess/plot.py @@ -27,9 +27,9 @@ def plot_net(nodes, edges, x_col=None, y_col=None, from_col=None, y_col : str, optional y coordinate column in nodes dataframe from_col : str, optional - name of column to use for 'from' node id + name of column to use for 'from' node ID to_col : str, optional - name of column to use for 'to' node id + name of column to use for 'to' node ID bbox : tuple, optional Bounding box formatted as a 4 element tuple: (lng_max, lat_min, lng_min, lat_max) @@ -248,9 +248,9 @@ def _prep_edges(edges, nodes, from_col, to_col, nodes : pandas.DataFrame edges : pandas.DataFrame from_col : string - name of column to use for 'from' node id + name of column to use for 'from' node ID to_col : string - name of column to use for 'to' node id + name of column to use for 'to' node ID x_col : string name of column to use for 'x' node coordinates y_col : string @@ -259,8 +259,8 @@ def _prep_edges(edges, nodes, from_col, to_col, Returns ------- edges_wline : pandas.DataFrame - the edge dataframe with from and to x y coordinates and - ids to build lines + the edge dataframe with from and to x and y coordinates and + IDs to build lines """ if x_col not in nodes.columns or y_col not in nodes.columns: diff --git a/urbanaccess/tests/conftest.py b/urbanaccess/tests/conftest.py index 5ca8297..a3e1db7 100644 --- a/urbanaccess/tests/conftest.py +++ b/urbanaccess/tests/conftest.py @@ -373,7 +373,8 @@ def calendar_dates_feed_1(): 'weekday-3', 'weekend-1'], 'date': [20161224, 20170318, 20160424, 20161230], - 'exception_type': [1, 2, 1, 1]} + 'exception_type': [1, 2, 1, 1], + 'schedule_type': ['WD', 'WD', 'WD', 'WE']} index = range(4) diff --git a/urbanaccess/tests/test_gtfs_load.py b/urbanaccess/tests/test_gtfs_load.py index f813f27..4733376 100644 --- a/urbanaccess/tests/test_gtfs_load.py +++ b/urbanaccess/tests/test_gtfs_load.py @@ -15,7 +15,7 @@ def expected_urbanaccess_gtfs_df_keys(): expected_keys = ['stops', 'routes', 'trips', 'stop_times', 'calendar', 'calendar_dates', 'stop_times_int', 'headways'] - return expected_keys.sort() + return sorted(expected_keys) @pytest.fixture @@ -119,8 +119,8 @@ def test_loadgtfsfeed_to_df_wo_calendar( urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar_dates'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -143,8 +143,8 @@ def test_loadgtfsfeed_to_df_wo_calendar_dates( urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -167,8 +167,8 @@ def test_loadgtfsfeed_to_df_w_calendar_and_calendar_dates( 
urbanaccess_gtfs_df_info = vars(loaded_feeds) expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar', 'calendar_dates'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty @@ -222,10 +222,9 @@ def test_loadgtfsfeed_to_df_wo_agency( append_definitions=False) assert isinstance(loaded_feeds, urbanaccess_gtfs_df) urbanaccess_gtfs_df_info = vars(loaded_feeds) - expected_dfs = ['stops', 'routes', 'trips', 'stop_times', - 'calendar'] - assert expected_urbanaccess_gtfs_df_keys == list( - urbanaccess_gtfs_df_info.keys()).sort() + expected_dfs = ['stops', 'routes', 'trips', 'stop_times', 'calendar'] + assert expected_urbanaccess_gtfs_df_keys == sorted(list( + urbanaccess_gtfs_df_info.keys())) for key, value in urbanaccess_gtfs_df_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty diff --git a/urbanaccess/tests/test_gtfs_network.py b/urbanaccess/tests/test_gtfs_network.py index 01de1c8..6e01323 100644 --- a/urbanaccess/tests/test_gtfs_network.py +++ b/urbanaccess/tests/test_gtfs_network.py @@ -1,25 +1,62 @@ import pytest +import os +import time +import glob import pandas as pd import numpy as np import urbanaccess.gtfs.network as gtfs_network import urbanaccess.gtfs.load as gtfs_load from urbanaccess.network import urbanaccess_network +from urbanaccess.gtfs.gtfsfeeds_dataframe import urbanaccess_gtfs_df @pytest.fixture def expected_urbanaccess_network_keys(): expected_keys = ['transit_nodes', 'transit_edges', 'net_connector_edges', 'osm_nodes', 'osm_edges', 'net_nodes', 'net_edges'] - return expected_keys.sort() + return sorted(expected_keys) + + +@pytest.fixture +def expected_gtfsfeeds_dfs_keys(): + expected_keys = ['stops', 'routes', 'trips', 'stop_times', + 'calendar_dates', 'calendar', 'stop_times_int', + 'headways'] + return sorted(expected_keys) @pytest.fixture def gtfs_feed_wo_calendar_dates( - tmpdir, agency_a_feed_on_disk_wo_calendar_dates): - feed_dir = agency_a_feed_on_disk_wo_calendar_dates + agency_a_feed_on_disk_wo_calendar_dates): + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=agency_a_feed_on_disk_wo_calendar_dates, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + return loaded_feeds + + +@pytest.fixture +def gtfs_feed_wo_calendar( + agency_a_feed_on_disk_wo_calendar): + loaded_feeds = gtfs_load.gtfsfeed_to_df( + gtfsfeed_path=agency_a_feed_on_disk_wo_calendar, + validation=False, + verbose=True, + bbox=None, + remove_stops_outsidebbox=False, + append_definitions=False) + return loaded_feeds + + +@pytest.fixture +def gtfs_feed_w_calendar_and_calendar_dates( + agency_a_feed_on_disk_w_calendar_and_calendar_dates): loaded_feeds = gtfs_load.gtfsfeed_to_df( - gtfsfeed_path=feed_dir, + gtfsfeed_path=agency_a_feed_on_disk_w_calendar_and_calendar_dates, validation=False, verbose=True, bbox=None, @@ -28,36 +65,79 @@ def gtfs_feed_wo_calendar_dates( return loaded_feeds +@pytest.fixture +def selected_int_stop_times_from_feed_wo_calendar_dates( + gtfs_feed_wo_calendar_dates): + # reproduce what is expected as the 'selected_interpolated_stop_times_df' + stop_times = gtfs_feed_wo_calendar_dates.stop_times.copy() + stop_times = stop_times.loc[stop_times['trip_id'] == 'a3'] + stop_times['unique_stop_id'] = ( 
+ stop_times['stop_id'].str.cat( + stop_times['unique_agency_id'].astype('str'), sep='_')) + stop_times['unique_trip_id'] = ( + stop_times['trip_id'].str.cat( + stop_times['unique_agency_id'].astype('str'), sep='_')) + data = { + 'departure_time_sec_interpolate': [29700, 30000, 30300, + 30600, 30900, 31200], + 'timediff': [np.nan, 300.0, 300.0, 300.0, 300.0, 300.0] + } + index = range(12, 18) + df = pd.DataFrame(data, index) + stop_times = pd.concat([stop_times, df], axis=1) + + return stop_times + + +@pytest.fixture +def selected_stops_from_feed_wo_calendar_dates(gtfs_feed_wo_calendar_dates): + # create 'final_selected_stops' df that is used as input to test function + stops_df = gtfs_feed_wo_calendar_dates.stops.copy() + stops_df = stops_df.iloc[0:6] + stops_df['unique_stop_id'] = ( + stops_df['stop_id'].str.cat( + stops_df['unique_agency_id'].astype('str'), sep='_')) + stops_df.set_index('unique_stop_id', drop=True, inplace=True) + stops_df.index.name = 'node_id' + return stops_df + + @pytest.fixture def stop_times(): data = { - 'unique_agency_id': ['citytrains'] * 25, + 'unique_agency_id': ['citytrains'] * 35, 'trip_id': ['a', 'a', 'a', 'a', 'a', 'b', 'b', 'b', 'b', 'b', 'c', 'c', 'c', 'c', 'c', 'd', 'd', 'd', 'd', 'd', - 'e', 'e', 'e', 'e', 'e'], - 'stop_id': str(range(25)), + 'e', 'e', 'e', 'e', 'e', + 'f', 'f', 'f', 'f', 'f', + 'g', 'g', 'g', 'g', 'g'], + 'stop_id': range(1, 36), 'departure_time_sec': [1, 2, np.nan, np.nan, 5, 1, 2, 3, 4, np.nan, np.nan, np.nan, 3, 4, np.nan, 1, 2, 3, 4, 5, - 1, np.nan, 3, 4, np.nan], - 'stop_sequence': [1, 2, 3, 4, 5] * 5 + 1, np.nan, 3, 4, np.nan, + 1, np.nan, 3, 4, 5, + np.nan, 2, 3, 4, 5], + 'stop_sequence': [1, 2, 3, 4, 5] * 7 } - index = range(25) + index = range(35) df = pd.DataFrame(data, index) + df['stop_id'] = df['stop_id'].astype('str') + return df @pytest.fixture def calendar(): data = { - 'unique_agency_id': ['citytrains'] * 4, - 'trip_id': ['a', 'b', 'c', 'e'] + 'unique_agency_id': ['citytrains'] * 6, + 'trip_id': ['a', 'b', 'c', 'e', 'f', 'g'] } - index = range(4) + index = range(6) df = pd.DataFrame(data, index) return df @@ -105,9 +185,151 @@ def stop_times_interpolated(): return df +@pytest.fixture +def transit_edge_from_feed_wo_calendar_dates(): + data = { + 'node_id_from': ['1_agency_a_city_a', '2_agency_a_city_a', + '3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a'], + 'node_id_to': ['2_agency_a_city_a', '3_agency_a_city_a', + '4_agency_a_city_a', '5_agency_a_city_a', + '6_agency_a_city_a'], + 'weight': [300.0] * 5, + 'unique_agency_id': ['agency_a_city_a'] * 5, + 'unique_trip_id': ['a3_agency_a_city_a'] * 5, + 'sequence': range(1, 6), + 'id': ['a3_agency_a_city_a_1', 'a3_agency_a_city_a_2', + 'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4', + 'a3_agency_a_city_a_5'], + } + index = range(5) + + df = pd.DataFrame(data, index) + return df + + +@pytest.fixture +def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1( + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + # represents df prior to being post-processed downstream + df = expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + df.drop(columns=['route_type'], inplace=True) + # convert weight from min to sec to represent df prior to post-process step + df['weight'] = 300.0 + return df + + +@pytest.fixture +def expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2(): + # represents df after it has been post-processed downstream + data = { + 'node_id_from': ['1_agency_a_city_a', '2_agency_a_city_a', + 
'3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a'], + 'node_id_to': ['2_agency_a_city_a', '3_agency_a_city_a', + '4_agency_a_city_a', '5_agency_a_city_a', + '6_agency_a_city_a'], + 'weight': [5.0] * 5, + 'unique_agency_id': ['agency_a_city_a'] * 5, + 'unique_trip_id': ['a3_agency_a_city_a'] * 5, + 'sequence': range(1, 6), + 'id': ['a3_agency_a_city_a_1', 'a3_agency_a_city_a_2', + 'a3_agency_a_city_a_3', 'a3_agency_a_city_a_4', + 'a3_agency_a_city_a_5'], + 'route_type': [3] * 5 + } + index = range(5) + df = pd.DataFrame(data, index) + # raw data are read as int32 + df['sequence'] = df['sequence'].astype('int32') + return df + + +@pytest.fixture +def expected_final_transit_edge_from_feed_wo_calendar_dates( + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + data = { + 'unique_route_id': ['10-101_agency_a_city_a'] * 5, + 'net_type': ['transit'] * 5 + } + index = range(5) + df = pd.DataFrame(data, index) + df = pd.concat( + [expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2, df], + axis=1) + return df + + +@pytest.fixture +def expected_transit_node_from_feed_wo_calendar_dates(): + data = { + 'node_id': ['1_agency_a_city_a', '2_agency_a_city_a', + '3_agency_a_city_a', '4_agency_a_city_a', + '5_agency_a_city_a', '6_agency_a_city_a'], + 'x': [-122.265609, -122.224274, -122.271604, -122.269029, -122.267227, + -122.251793], + 'y': [37.797484, 37.774963, 37.803664, 37.80787, 37.828415, 37.844601], + 'unique_agency_id': ['agency_a_city_a'] * 6, + 'route_type': [3] * 6, + 'stop_id': range(1, 7), + 'stop_name': ['ave a', 'ave b', 'ave c', 'ave d', 'ave e', 'ave f'], + 'wheelchair_boarding': [1, 0, 0, 0, 0, 0], + 'location_type': [1] * 6 + } + index = range(6) + + df = pd.DataFrame(data, index) + df['stop_id'] = df['stop_id'].astype('str') + df.set_index('node_id', drop=True, inplace=True) + return df + + +@pytest.fixture +def edge_route_type_impedance_df(): + data = { + 'weight': [2, 2, 2, 3, 3, 3, 5, 5, 5, 5], + 'route_type': [1, 1, 1, 2, 2, 2, 3, 3, 3, 3] + } + index = range(10) + + df = pd.DataFrame(data, index) + + return df + + +@pytest.fixture() +def hdf5_file_on_disk_gtfsfeeds_dfs( + tmpdir, + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + hdf5_dict = {'stop_times': gtfs_feed_wo_calendar_dates.stop_times, + 'stops': gtfs_feed_wo_calendar_dates.stops, + 'routes': gtfs_feed_wo_calendar_dates.routes, + 'trips': gtfs_feed_wo_calendar_dates.trips, + 'calendar': gtfs_feed_wo_calendar_dates.calendar, + 'stop_times_int': + selected_int_stop_times_from_feed_wo_calendar_dates} + hdf5_save_path = os.path.join(tmpdir.strpath, 'test_hdf5_load') + hdf5_file = os.path.join(hdf5_save_path, 'test_file.h5') + os.makedirs(hdf5_save_path) + print('writing test HDF5 to: {}'.format(hdf5_file)) + # create the HDF5 + store = pd.HDFStore(hdf5_file) + store.close() + # add keys and DFs to HDF5 + for key, df in hdf5_dict.items(): + store = pd.HDFStore(hdf5_file, mode='r') + store.close() + df.to_hdf(hdf5_file, key=key, mode='a', format='table') + return hdf5_save_path + + def test_create_transit_net_wo_calendar_dates( - tmpdir, gtfs_feed_wo_calendar_dates, - expected_urbanaccess_network_keys): + gtfs_feed_wo_calendar_dates, + expected_urbanaccess_network_keys, + expected_final_transit_edge_from_feed_wo_calendar_dates): + expected_result = \ + expected_final_transit_edge_from_feed_wo_calendar_dates.copy() transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', timerange=['07:00:00', '10:00:00'], @@ 
-115,22 +337,73 @@ def test_create_transit_net_wo_calendar_dates( overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) assert isinstance(transit_net, urbanaccess_network) urbanaccess_network_info = vars(transit_net) expected_dfs = ['transit_nodes', 'transit_edges'] - assert expected_urbanaccess_network_keys == list( - urbanaccess_network_info.keys()).sort() + assert expected_urbanaccess_network_keys == sorted(list( + urbanaccess_network_info.keys())) for key, value in urbanaccess_network_info.items(): assert isinstance(value, pd.core.frame.DataFrame) # check that df is not empty if key in expected_dfs: assert value.empty is False + result_edge = transit_net.transit_edges.copy() + # test that output df is identical to expected df + result_edge = result_edge.reindex( + sorted(result_edge.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result_edge['sequence'] = result_edge['sequence'].astype('int32') + assert result_edge.equals(expected_result) -def test_create_transit_net_wo_req_file( - tmpdir, gtfs_feed_wo_calendar_dates): + +def test_create_transit_net_wo_direction_id( + gtfs_feed_wo_calendar_dates, + expected_urbanaccess_network_keys, + expected_final_transit_edge_from_feed_wo_calendar_dates): + expected_result = \ + expected_final_transit_edge_from_feed_wo_calendar_dates.copy() + # remove 'direction_id' col for test + gtfs_feed_wo_calendar_dates.trips.drop( + columns=['direction_id'], inplace=True) + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + assert isinstance(transit_net, urbanaccess_network) + urbanaccess_network_info = vars(transit_net) + expected_dfs = ['transit_nodes', 'transit_edges'] + assert expected_urbanaccess_network_keys == sorted(list( + urbanaccess_network_info.keys())) + for key, value in urbanaccess_network_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + + result_edge = transit_net.transit_edges.copy() + # test that output df is identical to expected df + result_edge = result_edge.reindex( + sorted(result_edge.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result_edge['sequence'] = result_edge['sequence'].astype('int32') + assert result_edge.equals(expected_result) + + +def test_create_transit_net_wo_req_file(gtfs_feed_wo_calendar_dates): # set trips df to blank df for test gtfs_feed_wo_calendar_dates.trips = pd.DataFrame() with pytest.raises(ValueError) as excinfo: @@ -141,37 +414,245 @@ def test_create_transit_net_wo_req_file( overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) expected_error = ( - "one of the following gtfsfeeds_dfs objects trips, stops, " + "One of the following gtfsfeeds_dfs objects: trips, 
stops, " "or stop_times were found to be empty.") assert expected_error in str(excinfo.value) def test_create_transit_net_wo_calendar_and_calendar_dates( - tmpdir, gtfs_feed_wo_calendar_dates): + gtfs_feed_wo_calendar_dates): # set calendar_dates and calendar dfs to blank df for test gtfs_feed_wo_calendar_dates.calendar_dates = pd.DataFrame() gtfs_feed_wo_calendar_dates.calendar = pd.DataFrame() with pytest.raises(ValueError) as excinfo: transit_net = gtfs_network.create_transit_net( gtfs_feed_wo_calendar_dates, day='monday', - timerange=['07:00:00', '10:00:00'], + timerange=['07:00:00', '11:00:00'], calendar_dates_lookup=None, overwrite_existing_stop_times_int=False, use_existing_stop_times_int=False, save_processed_gtfs=False, - save_dir=tmpdir, + save_dir=None, save_filename=None) expected_error = ( - "one of the following gtfsfeeds_dfs objects calendar or " + "One of the following gtfsfeeds_dfs objects: calendar or " "calendar_dates were found to be empty.") assert expected_error in str(excinfo.value) +def test_create_transit_net_invalid_params(gtfs_feed_wo_calendar_dates): + msg = ('starttime and endtime are not in the correct format. ' + 'Format should be a 24 hour clock in the following format: ' + '08:00:00 or 17:00:00.') + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['7:00:0', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['7:00:0', '10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange='10:00:00', + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("10:00:00 {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=[100000, 170000], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("[100000, 170000] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['10:00:00', '07:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ("['10:00:00', '07:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net 
= gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=2, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = "overwrite_existing_stop_times_int must be bool." + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=2, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = "use_existing_stop_times_int must be bool." + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=2, + save_dir=None, + save_filename=None) + expected_error = "save_processed_gtfs must be bool." + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + None, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = "gtfsfeeds_dfs must be an urbanaccess_gtfs_df object." + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=True, + use_existing_stop_times_int=True, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + expected_error = ('overwrite_existing_stop_times_int and ' + 'use_existing_stop_times_int cannot both be True.') + assert expected_error in str(excinfo.value) + + +def test_create_transit_net_overwrite_stop_times_int_True( + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + # populate stop_times_int for test that is different than the one that + # would be calculated + df = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + df['timediff'] = df['timediff'] * 2 + gtfs_feed_wo_calendar_dates.stop_times_int = df + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=True, + use_existing_stop_times_int=False, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + # values should be different given overwrite_existing_stop_times_int = True + assert gtfs_feed_wo_calendar_dates.stop_times_int['timediff'].equals( + df['timediff']) is False + + +def test_create_transit_net_use_existing_stop_times_int_True( + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + # populate stop_times_int for test that is different than the one that + # would be calculated + df = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + df['timediff'] = df['timediff'] * 2 + 
gtfs_feed_wo_calendar_dates.stop_times_int = df + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=True, + save_processed_gtfs=False, + save_dir=None, + save_filename=None) + # values should be the the same since use_existing_stop_times_int = True + assert gtfs_feed_wo_calendar_dates.stop_times_int['timediff'].equals( + df['timediff']) + + +def test_create_transit_net_save_processed_gtfs_True( + tmpdir, gtfs_feed_wo_calendar_dates): + dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') + os.makedirs(dir_path) + print('preparing test dir: {}'.format(dir_path)) + + transit_net = gtfs_network.create_transit_net( + gtfs_feed_wo_calendar_dates, day='monday', + timerange=['07:00:00', '10:00:00'], + calendar_dates_lookup=None, + overwrite_existing_stop_times_int=False, + use_existing_stop_times_int=False, + save_processed_gtfs=True, + save_dir=dir_path, + save_filename='test_file.h5') + + # test that file was written as expected + file_list = glob.glob(r"{}/*.h5".format(dir_path)) + file_path = file_list[0] + file_name = os.path.basename(file_path) + assert file_name == 'test_file.h5' + # test HDF5 store + expected_keys = {'/calendar', '/routes', '/stop_times', '/stop_times_int', + '/stops', '/trips'} + with pd.HDFStore(file_path) as store: + result_keys = set(store.keys()) + assert result_keys == expected_keys + # check that data exists in each DataFrame + for key in expected_keys: + df = store[key] + assert df.empty is False + + def test_interpolator(stop_times, calendar): + # profile run times as _interpolate_stop_times() is a + # function that is critical to have fast run times + start_time = time.time() df = gtfs_network._interpolate_stop_times(stop_times, calendar) + print('Run time: {}'.format(time.time() - start_time)) # unique_trip_id should be generated assert df.loc[1, 'unique_trip_id'] == 'a_citytrains' @@ -180,8 +661,8 @@ def test_interpolator(stop_times, calendar): assert df.loc[df.trip_id == 'a', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4, 5] - # trip 'b' should be skipped because it has only one null value - # but its null value should be removed + # trip 'b' should be skipped because it has only one null value and + # its in the last position but its null value should be removed assert df.loc[df.trip_id == 'b', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4] @@ -198,42 +679,344 @@ def test_interpolator(stop_times, calendar): assert df.loc[df.trip_id == 'e', 'departure_time_sec_interpolate'].tolist() == [1, 2, 3, 4] + # TODO: This is a rare and unlikely case that should be supported + # in the future and when addressed we expect [1, 2, 3, 4, 5] for trip 'f' + # trip 'f' should be interpolated fully, + # the one NA in the middle of the sequence should be filled + # trip 'f' should be skipped because it has only one null value and + # its not a first or last value in sequence, but its null value should + # be removed + assert df.loc[df.trip_id == 'f', + 'departure_time_sec_interpolate'].tolist() == [1, 3, 4, 5] + + # trip 'g' should be interpolated + # no starting value, so first time removed + # NaN values should be removed from start + assert df.loc[df.trip_id == 'g', + 'departure_time_sec_interpolate'].tolist() == [2, 3, 4, 5] + + +def test_interpolator_w_missing_stop_sequence(stop_times, calendar): + # create nulls in stop_times 'stop_sequence' col + 
stop_times['stop_sequence'][1:4] = np.nan + stop_times['stop_sequence'][10:12] = np.nan + with pytest.raises(ValueError) as excinfo: + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + expected_error = ("Found duplicate values when values from stop_sequence " + "and unique_trip_id are combined. Check values in " + "these columns for trip_id(s): " + "['a_citytrains', 'c_citytrains'].") + assert expected_error in str(excinfo.value) + + +def test_interpolator_w_mismatch_trip_ids(stop_times, calendar): + # create nulls in stop_times 'stop_sequence' col + stop_times['trip_id'] = stop_times['trip_id'] + ' ' + + with pytest.raises(ValueError) as excinfo: + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + expected_error = ("No matching trip_ids where found. " + "Suggest checking for differences between trip_id " + "values in stop_times and trips GTFS files.") + assert expected_error in str(excinfo.value) + + +def test_interpolator_w_index_as_col(stop_times, calendar): + # set name on index that also exists as a col to run test + stop_times.index.rename('unique_agency_id', inplace=True) + df = gtfs_network._interpolate_stop_times(stop_times, calendar) + # no errors should occur so only need to check df is not empty + assert df.empty is False + def test_skip_interpolator(stop_times, calendar): series = pd.Series(data=[1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5], - index=range(25), + index=range(35), name='departure_time_sec') - stop_times['departure_time_sec'] = series - df = gtfs_network._interpolate_stop_times(stop_times, calendar) # everything should be the same, # with one row dropped for calendar day filter assert df.departure_time_sec_interpolate.tolist() == [1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, + 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5] -def test_edge_reformatter(stop_times_interpolated): +def test_trip_schedule_selector_wo_cal_dates(gtfs_feed_wo_calendar_dates): + expected_result = gtfs_feed_wo_calendar_dates.trips.copy() + # create expected trips result + expected_result.reset_index(drop=True, inplace=True) + expected_result = expected_result.iloc[0:8] + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar_dates.trips, + input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, + input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, + day='monday', + calendar_dates_lookup=None) + + assert len(result) == 8 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_wo_cal_dates_wo_direction_id( + gtfs_feed_wo_calendar_dates): + # remove 'direction_id' col for test + trips_df = gtfs_feed_wo_calendar_dates.trips.copy() + trips_df.drop(columns=['direction_id'], inplace=True) + expected_result = gtfs_feed_wo_calendar_dates.trips.copy() + # create expected trips result + expected_result.reset_index(drop=True, inplace=True) + expected_result.drop(columns=['direction_id'], inplace=True) + expected_result = expected_result.iloc[0:8] + + result = gtfs_network._trip_schedule_selector( + input_trips_df=trips_df, + input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, + input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, + day='monday', + calendar_dates_lookup=None) + + assert len(result) == 8 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_w_cal_dates(gtfs_feed_wo_calendar): + expected_result = gtfs_feed_wo_calendar.trips.copy() + # create expected trips result + expected_result = 
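
The interpolation behavior exercised by `test_interpolator` above reduces, for a fully interpolatable trip such as trip 'a' ([1, 2, NaN, NaN, 5]), to per-trip linear interpolation of `departure_time_sec`; a standalone pandas sketch of that core step follows (the library additionally drops unrecoverable leading values and skips trips with only a single missing time, as the assertions above spell out):

```python
import numpy as np
import pandas as pd

# One trip's stop_times with interior gaps, mirroring trip 'a' in the fixture.
trip = pd.DataFrame({
    'stop_sequence': [1, 2, 3, 4, 5],
    'departure_time_sec': [1.0, 2.0, np.nan, np.nan, 5.0]})

# Linear interpolation fills the interior gaps: [1.0, 2.0, 3.0, 4.0, 5.0].
trip['departure_time_sec_interpolate'] = (
    trip['departure_time_sec'].interpolate(method='linear'))
print(trip['departure_time_sec_interpolate'].tolist())
```
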
expected_result.iloc[4:10] + expected_result.reset_index(drop=True, inplace=True) + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar.trips, + input_calendar_df=gtfs_feed_wo_calendar.calendar, + input_calendar_dates_df=gtfs_feed_wo_calendar.calendar_dates, + day='sunday', + calendar_dates_lookup={'schedule_type': 'WE', + 'service_id': ['weekday-3', 'weekday-2']}) + + assert len(result) == 6 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_w_cal_and_cal_dates( + gtfs_feed_w_calendar_and_calendar_dates): + trips_df = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() + cal_df = gtfs_feed_w_calendar_and_calendar_dates.calendar.copy() + cal_dates_df = gtfs_feed_w_calendar_and_calendar_dates.calendar_dates \ + .copy() + expected_result = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() + result = gtfs_network._trip_schedule_selector( + input_trips_df=trips_df, + input_calendar_df=cal_df, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'schedule_type': 'WE'}) + + assert len(result) == 10 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_w_cal_and_cal_dates_wo_lookup( + gtfs_feed_w_calendar_and_calendar_dates): + trips_df_1 = gtfs_feed_w_calendar_and_calendar_dates.trips.copy() + cal_df = gtfs_feed_w_calendar_and_calendar_dates.calendar.copy() + cal_dates_df_1 = gtfs_feed_w_calendar_and_calendar_dates.calendar_dates \ + .copy() + # create extra records in trips and calendar_dates for a different agency + # that do not exist in the calendar table + trips_df_2 = trips_df_1.copy() + trips_df_2['unique_agency_id'] = trips_df_2['unique_agency_id'] + '_x' + trips_df_2['unique_feed_id'] = trips_df_2['unique_feed_id'] + '_x' + trips_df_2 = trips_df_2.iloc[0:8] + trips_df_x2 = pd.concat( + [trips_df_1, trips_df_2], axis=0, + ignore_index=True) + cal_dates_df_2 = cal_dates_df_1.copy() + cal_dates_df_2['unique_agency_id'] = \ + cal_dates_df_2['unique_agency_id'] + '_x' + cal_dates_df_2['unique_feed_id'] = \ + cal_dates_df_2['unique_feed_id'] + '_x' + cal_dates_df_x2 = pd.concat( + [cal_dates_df_1, cal_dates_df_2], axis=0, + ignore_index=True) + # create expected trips result + expected_result = trips_df_1.copy() + expected_result = expected_result.iloc[0:8] + result = gtfs_network._trip_schedule_selector( + input_trips_df=trips_df_x2, + input_calendar_df=cal_df, + input_calendar_dates_df=cal_dates_df_x2, + day='monday', + calendar_dates_lookup=None) + + assert len(result) == 8 + assert result.equals(expected_result) + + +def test_trip_schedule_selector_wo_cal_dates_invalid_params( + gtfs_feed_wo_calendar_dates): + gtfs_feed = gtfs_feed_wo_calendar_dates + # test with invalid 'day' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday ', + calendar_dates_lookup=None) + expected_error = ( + "Incorrect day specified. 
Must be one of lowercase strings: 'monday'," + " 'tuesday', 'wednesday', 'thursday', 'friday', 'saturday', 'sunday'.") + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup=['invalid']) + expected_error = "calendar_dates_lookup parameter must be a dictionary." + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={1: 'WD'}) + expected_error = "calendar_dates_lookup key: 1 must be a string." + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': 1}) + expected_error = ("calendar_dates_lookup value: 1 must be a " + "string or a list of strings.") + assert expected_error in str(excinfo.value) + # test with invalid 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed.trips, + input_calendar_df=gtfs_feed.calendar, + input_calendar_dates_df=gtfs_feed.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': ['WD', 1]}) + expected_error = ("calendar_dates_lookup value: ['WD', 1] " + "must contain strings.") + assert expected_error in str(excinfo.value) + + +def test_trip_schedule_selector_w_cal_dates_invalid_params_1( + gtfs_feed_wo_calendar_dates): + # test with empty 'calendar_dates'df + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar_dates.trips, + input_calendar_df=gtfs_feed_wo_calendar_dates.calendar, + input_calendar_dates_df=gtfs_feed_wo_calendar_dates.calendar_dates, + day='monday', + calendar_dates_lookup={'schedule_type': 'WD'}) + expected_error = ("calendar_dates_df is empty. 
Unable to use the " + "calendar_dates_lookup parameter.") + assert expected_error in str(excinfo.value) + + +def test_trip_schedule_selector_w_cal_dates_invalid_params_2( + gtfs_feed_wo_calendar): + # create invalid data in calendar dates file + cal_dates_df = gtfs_feed_wo_calendar.calendar_dates.copy() + series = pd.Series( + data=[1, 1, 2, 2], index=range(4), + name='invalid_dtype') + cal_dates_df['invalid_dtype'] = series + series = pd.Series( + data=[10, 11, 10, 'aa'], index=range(4), + name='day_type') + cal_dates_df['day_type'] = series + + # test with invalid col in 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar.trips, + input_calendar_df=gtfs_feed_wo_calendar.calendar, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'invalid_col': 'WD'}) + expected_error = ("Column: invalid_col not found in calendar_dates " + "dataframe.") + assert expected_error in str(excinfo.value) + # test with invalid col dtype in 'calendar_dates_lookup' param + with pytest.raises(ValueError) as excinfo: + result = gtfs_network._trip_schedule_selector( + input_trips_df=gtfs_feed_wo_calendar.trips, + input_calendar_df=gtfs_feed_wo_calendar.calendar, + input_calendar_dates_df=cal_dates_df, + day='monday', + calendar_dates_lookup={'invalid_dtype': '1'}) + expected_error = ("Column: invalid_dtype must be object type.") + assert expected_error in str(excinfo.value) + + +def test_time_selector(selected_int_stop_times_from_feed_wo_calendar_dates): + timerange = ['08:20:00', '08:35:00'] + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + result = gtfs_network._time_selector( + df=stop_times_int, + starttime=timerange[0], + endtime=timerange[1]) + + # create expected subset result + expected_result = stop_times_int.loc[14:15] + assert len(result) == 2 + assert result.equals(expected_result) + + +def test_time_difference(selected_int_stop_times_from_feed_wo_calendar_dates): + expected_result = \ + selected_int_stop_times_from_feed_wo_calendar_dates.copy() + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + # create the 'stop_times_int' df expected + stop_times_int.drop(columns=['timediff'], inplace=True) + result = gtfs_network._time_difference(stop_times_df=stop_times_int) + + assert 'timediff' in result.columns + # all rows in sequence should not be null + assert result['timediff'][1:6].isnull().sum() == 0 + # only the first row in sequence should be null + assert result['timediff'][0:1].isnull().sum() == 1 + assert result.equals(expected_result) + + +def test_format_transit_net_edge_test_1(stop_times_interpolated): df = gtfs_network._format_transit_net_edge(stop_times_interpolated) # length of edge df should be 16 assert len(df) == 16 - # sequence id should be numeric starting at 1 and end at 4 for each trip + # sequence ID should be numeric starting at 1 and end at 4 for each trip assert df['sequence'][0] == 1 and df['sequence'][3] == 4 # edge df should have these columns and no null values for col in ['node_id_from', 'node_id_to', 'weight']: - assert col in df.columns and df[col].isnull().values.any() == False # noqa + assert col in df.columns and df[ + col].isnull().values.any() == False # noqa - # there should be 4 edges per trip id + # there should be 4 edges per trip ID for i, row in df.groupby('unique_trip_id').size().iteritems(): assert row == 4 @@ -249,3 +1032,342 @@ def 
test_edge_reformatter(stop_times_interpolated): 'unique_trip_id'][11] and \ df['unique_agency_id'][8] == stop_times_interpolated[ 'unique_agency_id'][11] # noqa + + +def test_format_transit_net_edge_test_2( + selected_int_stop_times_from_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_1.copy() + + # create the 'selected_interpolated_stop_times_df' that is expected + stop_times_int = selected_int_stop_times_from_feed_wo_calendar_dates.copy() + # there are no missing time values in the test data so just use + # 'departure_time_sec' to generate the timediff col for the test + stop_times_int['timediff'] = stop_times_int.groupby('unique_trip_id')[ + 'departure_time_sec'].diff() + result = gtfs_network._format_transit_net_edge(stop_times_int) + + # test that output df is identical to expected df + result = result.reindex( + sorted(result.columns), axis=1) + expected_result = expected_result.reindex( + sorted(expected_result.columns), axis=1) + # ensure 'sequence' is int32 for test as other OS sometimes reads this as + # int64 and will cause tests to fail when using equals() + result['sequence'] = result['sequence'].astype('int32') + assert result.equals(expected_result) + + +def test_convert_imp_time_units( + transit_edge_from_feed_wo_calendar_dates): + # test with minutes + result_min = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='minutes') + expected_weight_as_min = pd.Series( + data=[5.0] * 5, index=range(5), name='weight') + assert result_min['weight'].equals(expected_weight_as_min) + + # test with seconds + # convert original weight of min to sec + transit_edge_from_feed_wo_calendar_dates['weight'] = expected_weight_as_min + result_sec = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='seconds') + expected_weight_as_sec = pd.Series( + data=[300.0] * 5, index=range(5), name='weight') + assert result_sec['weight'].equals(expected_weight_as_sec) + + +def test_convert_imp_time_units_invalid_params( + transit_edge_from_feed_wo_calendar_dates): + # test with invalid 'convert_to' param name + with pytest.raises(ValueError) as excinfo: + result_min = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to='minutes_invalid') + expected_error = ("minutes_invalid is not a valid value " + "or is not a string.") + assert expected_error in str(excinfo.value) + # test with invalid 'convert_to' dtype + with pytest.raises(ValueError) as excinfo: + result_min = gtfs_network._convert_imp_time_units( + df=transit_edge_from_feed_wo_calendar_dates, + time_col='weight', convert_to=22) + expected_error = "22 is not a valid value or is not a string." 
+ assert expected_error in str(excinfo.value) + + +def test_stops_in_edge_table_selector( + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + # created expected result + expected_result = gtfs_feed_wo_calendar_dates.stops[0:6] + expected_result['unique_stop_id'] = ( + expected_result['stop_id'].str.cat( + expected_result['unique_agency_id'].astype('str'), sep='_')) + + result = gtfs_network._stops_in_edge_table_selector( + input_stops_df=gtfs_feed_wo_calendar_dates.stops, + input_stop_times_df=selected_int_stop_times_from_feed_wo_calendar_dates + ) + + assert 'unique_stop_id' in result.columns + assert result['unique_stop_id'].isnull().sum() == 0 + assert result.equals(expected_result) + + +def test_format_transit_net_nodes( + selected_stops_from_feed_wo_calendar_dates, + expected_transit_node_from_feed_wo_calendar_dates): + expected_result = expected_transit_node_from_feed_wo_calendar_dates.copy() + expected_cols = ['x', 'y', 'unique_agency_id', 'route_type', 'stop_id', + 'stop_name'] + + result = gtfs_network._format_transit_net_nodes( + df=selected_stops_from_feed_wo_calendar_dates) + + for col in expected_cols: + assert col in result.columns + assert result[col].isnull().sum() == 0 + assert result.index.name == 'node_id' + assert result.index.isnull().sum() == 0 + # round result to ensure decimal place match + result['x'] = result['x'].round(decimals=6) + result['y'] = result['y'].round(decimals=6) + # test that output df is identical to expected df + # re-sort cols so they are in same order for test + expected_result.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) + assert result.equals(expected_result) + + +def test_route_type_to_edge( + gtfs_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + input_edge_df = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + + # 'route_type' is added in this function and is not expected to already + # exist + input_edge_df.drop(columns=['route_type'], inplace=True) + + result = gtfs_network._route_type_to_edge( + transit_edge_df=input_edge_df, + stop_time_df=gtfs_feed_wo_calendar_dates.stop_times) + assert 'route_type' in result.columns + assert result['route_type'].isnull().sum() == 0 + # re-sort cols so they are in same order for test + expected_result.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) + assert result.equals(expected_result) + + +def test_route_id_to_edge( + gtfs_feed_wo_calendar_dates, + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2): + expected_result = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + series = pd.Series( + data=['10-101_agency_a_city_a'] * 5, index=range(5), + name='unique_route_id') + expected_result['unique_route_id'] = series + input_edge_df = \ + expected_transit_edge_from_feed_wo_calendar_dates_process_lvl_2.copy() + + result = gtfs_network._route_id_to_edge( + transit_edge_df=input_edge_df, + trips_df=gtfs_feed_wo_calendar_dates.trips) + assert 'unique_route_id' in result.columns + assert result['unique_route_id'].isnull().sum() == 0 + assert result.equals(expected_result) + + +def test_check_if_index_name_in_cols_False( + selected_stops_from_feed_wo_calendar_dates): + result = gtfs_network._check_if_index_name_in_cols( + selected_stops_from_feed_wo_calendar_dates) + assert isinstance(result, bool) + assert result 
is False + + +def test_check_if_index_name_in_cols_True( + selected_stops_from_feed_wo_calendar_dates): + selected_stops_from_feed_wo_calendar_dates.reset_index(inplace=True) + selected_stops_from_feed_wo_calendar_dates.set_index( + 'node_id', drop=False, inplace=True) + + result = gtfs_network._check_if_index_name_in_cols( + selected_stops_from_feed_wo_calendar_dates) + assert isinstance(result, bool) + assert result is True + + +def test_edge_impedance_by_route_type(edge_route_type_impedance_df): + df = edge_route_type_impedance_df.copy() + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=0.5, + intercity_rail=-0.5) + # route_id 1 weight should increase via multiplier + assert (result.weight.iloc[0:3] == df.weight.iloc[0:3] + ( + df.weight.iloc[0:3] * 0.5)).all() + # route_id 2 weight should decrease via multiplier + assert (result.weight.iloc[3:6] == df.weight.iloc[3:6] + ( + df.weight.iloc[3:6] * -0.5)).all() + # route_id 3 weight should not change + assert (result.weight.iloc[6:9] == df.weight.iloc[6:9]).all() + + +def test_edge_impedance_by_route_type_invalid_params( + edge_route_type_impedance_df): + # test with multiplier outside of optimal range + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=2.0, + intercity_rail=-3.0) + # should return a result even if multiplier is not in optimal range + assert result.empty is False + # test with weight param as invalid dtype + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name=2, + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = "travel_time_col_name must be a string." + assert expected_error in str(excinfo.value) + # test with weight param as invalid dtype + # create str weight column + edge_route_type_impedance_df['travel_time'] = '1' + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name='travel_time', + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = "travel_time must be a number." + assert expected_error in str(excinfo.value) + # test with weight column that cant be found in DataFrame + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + travel_time_col_name='time', + underground_rail=0.5, + intercity_rail=-0.5) + expected_error = ("Column: time was not found in " + "transit_edge_df DataFrame and is required.") + assert expected_error in str(excinfo.value) + # test with multiplier value as str + with pytest.raises(ValueError) as excinfo: + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail='1', + intercity_rail=-0.5) + expected_error = "One or more multiplier variables are not float." 
+ assert expected_error in str(excinfo.value) + # test with route type that is not found in DataFrame + result = gtfs_network.edge_impedance_by_route_type( + edge_route_type_impedance_df, + underground_rail=0.5, + funicular=-0.5) + # should return a result even if route type is not found in DataFrame + assert result.empty is False + + +def test_save_processed_gtfs_data( + tmpdir, + selected_int_stop_times_from_feed_wo_calendar_dates, + gtfs_feed_wo_calendar_dates): + # add stop_times_int to UA object which is required for saving HDF5 + gtfs_feed_wo_calendar_dates.stop_times_int = \ + selected_int_stop_times_from_feed_wo_calendar_dates + dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') + os.makedirs(dir_path) + print('preparing test dir: {}'.format(dir_path)) + + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + # test that file was written as expected + file_list = glob.glob(r"{}/*.h5".format(dir_path)) + file_path = file_list[0] + file_name = os.path.basename(file_path) + assert file_name == 'test_file.h5' + # test HDF5 store + expected_keys = {'/calendar', '/routes', '/stop_times', '/stop_times_int', + '/stops', '/trips'} + with pd.HDFStore(file_path) as store: + result_keys = set(store.keys()) + assert result_keys == expected_keys + # check that data exists in each DataFrame + for key in expected_keys: + df = store[key] + assert df.empty is False + + +def test_save_processed_gtfs_data_invalid_params( + tmpdir, + gtfs_feed_wo_calendar_dates, + selected_int_stop_times_from_feed_wo_calendar_dates): + dir_path = os.path.join(tmpdir.strpath, 'test_hdf5_save') + os.makedirs(dir_path) + print('preparing test dir: {}'.format(dir_path)) + # test with missing req DataFrame: stop_times_int + gtfs_feed_wo_calendar_dates.stop_times_int = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs is missing required ' + 'DataFrame: stop_times_int.') + assert expected_error in str(excinfo.value) + + # set stop_times_int df for test + gtfs_feed_wo_calendar_dates.stop_times_int = \ + selected_int_stop_times_from_feed_wo_calendar_dates + # set calendar df to blank df for test + gtfs_feed_wo_calendar_dates.calendar = pd.DataFrame() + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + gtfs_feed_wo_calendar_dates, + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs is missing either the calendar or ' + 'calendar_dates DataFrame.') + assert expected_error in str(excinfo.value) + + # test with incorrect dtype as param + with pytest.raises(ValueError) as excinfo: + gtfs_network.save_processed_gtfs_data( + 'invalid_param', + filename='test_file.h5', dir=dir_path) + expected_error = ('gtfsfeeds_dfs must be an urbanaccess_gtfs_df ' + 'object.') + assert expected_error in str(excinfo.value) + + +def test_load_processed_gtfs_data( + hdf5_file_on_disk_gtfsfeeds_dfs, expected_gtfsfeeds_dfs_keys): + gtfsfeeds_dfs = gtfs_network.load_processed_gtfs_data( + filename='test_file.h5', dir=hdf5_file_on_disk_gtfsfeeds_dfs) + assert isinstance(gtfsfeeds_dfs, urbanaccess_gtfs_df) + urbanaccess_gtfs_df_info = vars(gtfsfeeds_dfs) + + assert expected_gtfsfeeds_dfs_keys == sorted( + list(urbanaccess_gtfs_df_info.keys())) + # headways and calendar_dates were not written to HDF5 so we dont + # expect them in this test + expected_dfs = ['stops', 'routes', 
'trips', 'stop_times', 'calendar', + 'stop_times_int'] + expected_dfs_empty = ['calendar_dates', 'headways'] + for key, value in urbanaccess_gtfs_df_info.items(): + assert isinstance(value, pd.core.frame.DataFrame) + # check that df is not empty + if key in expected_dfs: + assert value.empty is False + # check that df is empty + if key in expected_dfs_empty: + assert value.empty diff --git a/urbanaccess/tests/test_gtfs_utils_format.py b/urbanaccess/tests/test_gtfs_utils_format.py index d22d04e..b2e7abd 100644 --- a/urbanaccess/tests/test_gtfs_utils_format.py +++ b/urbanaccess/tests/test_gtfs_utils_format.py @@ -924,6 +924,9 @@ def test_remove_whitespace_from_values(trips_feed_w_invalid_values): df=raw_df, textfile='trips.txt', col_list=['trip_id', 'service_id', 'route_id']) + # re-sort cols so they are in same order for test + expected_df.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) assert result.equals(expected_df) # test when no col_list is used @@ -942,4 +945,7 @@ def test_read_gtfs_trips_w_invalid_values(trips_feed_w_invalid_values): raw_df, expected_df, feed_path = trips_feed_w_invalid_values result = utils_format._read_gtfs_trips( textfile_path=feed_path, textfile='trips.txt') + # re-sort cols so they are in same order for test + expected_df.sort_index(axis=1, inplace=True) + result.sort_index(axis=1, inplace=True) assert result.equals(expected_df) diff --git a/urbanaccess/tests/test_gtfs_utils_validation.py b/urbanaccess/tests/test_gtfs_utils_validation.py new file mode 100644 index 0000000..a43a5ca --- /dev/null +++ b/urbanaccess/tests/test_gtfs_utils_validation.py @@ -0,0 +1,33 @@ +import pytest + +import urbanaccess.gtfs.utils_validation as utils_validation + + +def test_check_time_range_format(): + utils_validation._check_time_range_format(['07:00:00', '10:00:00']) + + +def test_check_time_range_format_invalid_params(): + msg = ('starttime and endtime are not in the correct format. 
' + 'Format should be a 24 hour clock in the following format: ' + '08:00:00 or 17:00:00.') + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['7:00:0', '10:00:00']) + expected_error = ("['7:00:0', '10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['10:00:00']) + expected_error = ("['10:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format('10:00:00') + expected_error = ("10:00:00 {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format([100000, 170000]) + expected_error = ("[100000, 170000] {}".format(msg)) + assert expected_error in str(excinfo.value) + with pytest.raises(ValueError) as excinfo: + utils_validation._check_time_range_format(['10:00:00', '07:00:00']) + expected_error = ("['10:00:00', '07:00:00'] {}".format(msg)) + assert expected_error in str(excinfo.value) diff --git a/urbanaccess/tests/test_gtfsfeeds.py b/urbanaccess/tests/test_gtfsfeeds.py index dacf5be..7dc7858 100644 --- a/urbanaccess/tests/test_gtfsfeeds.py +++ b/urbanaccess/tests/test_gtfsfeeds.py @@ -87,7 +87,7 @@ def test_to_yaml_feed(tmpdir, feed_dict3): yaml_path = os.path.join(tmpdir.strpath, 'gtfsfeeds.yaml') with open(yaml_path, 'r') as f: - yaml_config = yaml.load(f) + yaml_config = yaml.safe_load(f) assert yaml_config['gtfs_feeds'] == feed_dict3 # clear feeds from global memory feeds.remove_feed(remove_all=True) diff --git a/urbanaccess/utils.py b/urbanaccess/utils.py index d5f0420..d34197d 100644 --- a/urbanaccess/utils.py +++ b/urbanaccess/utils.py @@ -123,18 +123,18 @@ def _get_logger(level=None, name=None, filename=None): def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): """ - Create a empty hdf5 file + Create an empty HDF5 file Parameters ---------- dir : string, optional - directory to save hdf5 file, if None defaults to dir set in + directory to save HDF5 file, if None defaults to dir set in config.settings.data_folder filename : string, optional - name of the hdf5 file to save with .h5 extension, if None defaults + name of the HDF5 file to save with .h5 extension, if None defaults to urbanaccess.h5 overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns @@ -145,35 +145,35 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): dir = config.settings.data_folder else: if not isinstance(dir, str): - raise ValueError('Directory must be a string') + raise ValueError('Directory must be a string.') try: if not os.path.exists(dir): os.makedirs(dir) except Exception: - raise ValueError('Unable to make directory {}'.format(dir)) + raise ValueError('Unable to make directory {}.'.format(dir)) if filename is None: filename = 'urbanaccess.h5' else: if not isinstance(filename, str): - raise ValueError('Filename must be a string') + raise ValueError('Filename must be a string.') - hdf5_save_path = '{}/{}'.format(dir, filename) + hdf5_save_path = os.path.join(dir, filename) if not filename.endswith('.h5'): - raise ValueError('hdf5 filename extension must be "h5"') + raise ValueError('HDF5 filename extension must be "h5".') if not os.path.exists(hdf5_save_path): store = pd.HDFStore(hdf5_save_path) 
store.close() - log('New {} hdf5 store created in dir: {}'.format(filename, dir)) + log(' New {} HDF5 store created in dir: {}.'.format(filename, dir)) elif overwrite_hdf5 and os.path.exists(hdf5_save_path): store = pd.HDFStore(hdf5_save_path) store.close() - log('Existing {} hdf5 store in dir: has been overwritten.'.format( - hdf5_save_path)) + log(' Existing {} HDF5 store in dir: {} has been ' + 'overwritten.'.format(filename, dir)) else: - log('Using existing {} hdf5 store.'.format(hdf5_save_path)) + log(' Using existing HDF5 store: {}.'.format(hdf5_save_path)) return hdf5_save_path @@ -181,65 +181,65 @@ def create_hdf5(dir=None, filename=None, overwrite_hdf5=False): def df_to_hdf5(data=None, key=None, overwrite_key=False, dir=None, filename=None, overwrite_hdf5=False): """ - Write a pandas dataframe to a table in a hdf5 file + Write a Pandas dataframe to a table in a HDF5 file Parameters ---------- data : pandas.DataFrame - pandas dataframe to save to a hdf5 table + Pandas dataframe to save to a HDF5 table key : string - name of table to save dataframe as in the hdf5 file + name of table to save dataframe as in the HDF5 file overwrite_key : bool, optional if true any existing table with the specified key name will be overwritten dir : string - directory to save hdf5 file + directory to save HDF5 file filename : string - name of the hdf5 file to save with .h5 extension + name of the HDF5 file to save with .h5 extension overwrite_hdf5 : bool, optional - if true any existing hdf5 file with the specified name in the + if true any existing HDF5 file with the specified name in the specified directory will be overwritten Returns ------- None """ - hdf5_save_path = create_hdf5(dir=dir, filename=filename, - overwrite_hdf5=overwrite_hdf5) + hdf5_save_path = create_hdf5( + dir=dir, filename=filename, overwrite_hdf5=overwrite_hdf5) store = pd.HDFStore(hdf5_save_path, mode='r') if not ''.join(['/', key]) in store.keys(): store.close() data.to_hdf(hdf5_save_path, key=key, mode='a', format='table') - log('{} saved in {} hdf5 store.'.format(key, hdf5_save_path)) + log(' DataFrame: {} saved in HDF5 store: {}.'.format( + key, hdf5_save_path)) elif ''.join(['/', key]) in store.keys() and overwrite_key: store.close() data.to_hdf(hdf5_save_path, key=key, mode='a', format='table') - log('Existing {} overwritten in {} hdf5 store.'.format(key, - hdf5_save_path)) + log(' Existing DataFrame: {} overwritten in HDF5 store: {}.'.format( + key, hdf5_save_path)) else: store.close() - log( - 'Key {} already exists in {} hdf5 store. Set to overwrite_key = ' - 'True to replace.'.format( - key, hdf5_save_path)) + log(' Key {} already exists in HDF5 store: {}. 
' + 'Set to overwrite_key = True to replace existing ' + 'data in key.'.format(key, hdf5_save_path)) def hdf5_to_df(dir=None, filename=None, key=None): """ - Read data from a hdf5 file to a pandas dataframe + Read data from a HDF5 file to a Pandas dataframe Parameters ---------- dir : string - directory of the hdf5 file to read from + directory of the HDF5 file to read from filename : string - name of the hdf5 file with .h5 extension to read from + name of the HDF5 file with .h5 extension to read from key : string - table inside the hdf5 file to return as a pandas dataframe + table inside the HDF5 file to return as a Pandas dataframe Returns ------- @@ -249,32 +249,29 @@ def hdf5_to_df(dir=None, filename=None, key=None): dir = config.settings.data_folder else: if not isinstance(dir, str): - raise ValueError('Directory must be a string') + raise ValueError('Directory must be a string.') if filename is None: filename = 'urbanaccess_net.h5' else: if not isinstance(filename, str): - raise ValueError('Filename must be a string') + raise ValueError('Filename must be a string.') - hdf5_load_path = '{}/{}'.format(dir, filename) + hdf5_load_path = os.path.join(dir, filename) if not filename.endswith('.h5'): - raise ValueError('hdf5 filename extension must be "h5"') + raise ValueError('HDF5 filename extension must be "h5".') if not os.path.exists(hdf5_load_path): - raise ValueError('Unable to find directory or file: {}'.format( + raise ValueError('Unable to find directory or file: {}.'.format( hdf5_load_path)) with pd.HDFStore(hdf5_load_path) as store: - # TODO: fix print statement to only display current key, not all keys - log('Successfully read store: {} with the following keys: {}'.format( - hdf5_load_path, store.keys())) + log(' Reading HDF5 store: {}...'.format(hdf5_load_path)) try: df = store[key] - ('Returned {} as dataframe'.format(key)) + log(' Successfully returned: {} as DataFrame.'.format(key)) except Exception: - raise ValueError( - 'Unable to find key: {}. Keys found: {}'.format(key, - store.keys())) + raise ValueError('Unable to find key: {}. Keys found: {}.'.format( + key, store.keys())) return df