Merge pull request #93 from UDST/data-loading

Template for data loading
UDST · Feb 20, 2019 · 1879a72 · 1879a72
2 parents 1a5f2bf + 9409d2f
commit 1879a72
Show file tree

Hide file tree

Showing 15 changed files with 663 additions and 12 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,5 +1,10 @@
 # UrbanSim Templates change log
 
+### 0.2.dev0 (2019-02-19)
+
+- adds first data i/o template: `urbansim_templates.io.TableFromDisk()`
+- adds support for `autorun` template property
+
 ### 0.1.1 (2019-02-05)
 
 - production release

diff --git a/docs/build/.gitignore b/docs/build/.gitignore
@@ -0,0 +1 @@
+**/*
diff --git a/docs/source/data-io.rst b/docs/source/data-io.rst
@@ -0,0 +1,17 @@
+Data I/O template APIs
+======================
+
+Data I/O templates let you set up automated model steps for loading data into Orca or saving outputs to disk.
+
+These templates follow the same principles as the statistical model steps. For example, to set up a data table, create an instance of the ``TableFromDisk`` class and set some properties: the table name, file type, path, and anything else that's needed. 
+
+Registering this object with ModelManager will save it to disk as a yaml file, and create an Orca step with instructions to set up the table. "Running" the object/step registers the table with Orca, but doesn't read the data from disk yet — Orca loads data lazily as it's needed.
+
+Data registration steps are run automatically when you initialize ModelManager.
+
+
+Table from disk
+---------------
+
+.. autoclass:: urbansim_templates.io.TableFromDisk
+   :members:
diff --git a/docs/source/getting-started.rst b/docs/source/getting-started.rst
@@ -105,7 +105,7 @@ The default file location is a ``configs`` folder located in the current working
         In [2]: import urbansim_templates
                 print(urbansim_templates.__version__)
         
-        Out[2]: '0.1.dev12'
+        Out[2]: '0.2.dev0'
 
 
 Creating a model step

diff --git a/docs/source/index.rst b/docs/source/index.rst
@@ -10,7 +10,7 @@ UrbanSim Templates provides building blocks for Orca-based simulation models. It
 
 The library contains templates for common types of model steps, plus a tool called ModelManager that runs as an extension to the `Orca <https://udst.github.io/orca>`__ task orchestrator. ModelManager can register template-based model steps with the orchestrator, save them to disk, and automatically reload them for future sessions.
 
-v0.1.1, released February 5, 2019
+v0.2.dev0, released February 19, 2019
 
 
 Contents
@@ -22,5 +22,6 @@ Contents
    getting-started
    modelmanager
    model-steps
+   data-io
    utilities
    development
diff --git a/docs/source/model-steps.rst b/docs/source/model-steps.rst
@@ -1,5 +1,5 @@
-Template APIs
-=============
+Model step template APIs
+========================
 
 The following templates are included in the core package. ModelManager can also work with templates defined elsewhere, as long as they follow the specifications described in the design guidelines.
 
@@ -32,7 +32,6 @@ Large Multinomial Logit
    :members:
 
 
-
 Segmented Large Multinomial Logit
 ---------------------------------
 

diff --git a/setup.py b/setup.py
@@ -6,7 +6,7 @@
 
 setup(
     name='urbansim_templates',
-    version='0.1.1',
+    version='0.2.dev0',
     description='UrbanSim extension for managing model steps',
     author='UrbanSim Inc.',
     author_email='info@urbansim.com',

diff --git a/tests/data/README.md b/tests/data/README.md
@@ -0,0 +1 @@
+This folder stores data that is temporarily generated during tests.
diff --git a/tests/test_tables.py b/tests/test_tables.py
@@ -0,0 +1,303 @@
+import os
+
+import numpy as np
+import pandas as pd
+import pytest
+
+import orca
+
+from urbansim_templates import modelmanager
+from urbansim_templates.io import TableFromDisk
+from urbansim_templates.utils import validate_template
+
+
+@pytest.fixture
+def orca_session():
+    """
+    Clear everything registered with Orca, then initialize ModelManager.
+    
+    Tests request this fixture so that they start from an empty Orca state.
+    
+    """
+    orca.clear_all()
+    modelmanager.initialize()
+
+
+@pytest.fixture
+def data(request):
+    """
+    Write a small random buildings table to disk in three formats.
+
+    Creates ``data/buildings.csv``, ``data/buildings.csv.gz`` (gzip-compressed CSV) and
+    ``data/buildings.hdf`` (key ``'buildings'``). Each file holds 10 rows indexed by
+    ``building_id`` with a random ``price`` column. Paths are relative to the current
+    working directory.
+
+    The files are deleted when the requesting test finishes.
+
+    """
+    csv_path = 'data/buildings.csv'
+    gz_path = 'data/buildings.csv.gz'
+    hdf_path = 'data/buildings.hdf'
+
+    def teardown():
+        # Skip files that were never written, so a partially failed setup is still
+        # cleaned up without raising a second error
+        for path in (csv_path, gz_path, hdf_path):
+            if os.path.exists(path):
+                os.remove(path)
+
+    # Register the cleanup before writing anything, so that files already on disk are
+    # removed even if one of the writes below raises
+    request.addfinalizer(teardown)
+
+    d1 = {'building_id': np.arange(10),
+          'price': 1e6*np.random.random(10)}
+
+    bldg = pd.DataFrame(d1).set_index('building_id')
+    bldg.to_csv(csv_path)
+    bldg.to_csv(gz_path, compression='gzip')
+    bldg.to_hdf(hdf_path, key='buildings')
+
+
+def test_template_validity():
+    """
+    The TableFromDisk class should pass the standard template validation check.
+
+    """
+    is_valid = validate_template(TableFromDisk)
+    assert is_valid
+
+
+def test_property_persistence(orca_session):
+    """
+    Placeholder for a test of property persistence across registration, saving, and
+    reloading.
+
+    TO DO - implement. The body is empty, so this test currently passes without
+    checking anything.
+
+    """
+
+
+######################################
+### TESTS OF THE VALIDATE() METHOD ###
+######################################
+
+def test_validation_index_unique(orca_session):
+    """
+    validate() should complete without raising when the table's index is unique.
+
+    As a testing shortcut, this and the other validate() tests register Orca tables
+    directly rather than loading them from disk. In normal use the method validates
+    the table that the template itself loaded.
+
+    """
+    columns = {'id': [1,2,3], 'value': [4,4,4]}
+    df = pd.DataFrame(columns).set_index('id')
+    orca.add_table('tab', df)
+
+    step = TableFromDisk(name='tab')
+    step.validate()
+
+
+def test_validation_index_not_unique(orca_session):
+    """
+    Table validation should raise a ValueError if the index contains duplicates.
+
+    """
+    d = {'id': [1,1,3], 'value': [4,4,4]}
+    orca.add_table('tab', pd.DataFrame(d).set_index('id'))
+
+    t = TableFromDisk(name='tab')
+
+    # pytest.raises fails the test if validate() returns without raising ValueError
+    with pytest.raises(ValueError):
+        t.validate()
+
+
+def test_validation_multiindex_unique(orca_session):
+    """
+    validate() should complete without raising when every (id, sub_id) combination in
+    a MultiIndex is unique, even though individual 'id' values repeat.
+
+    """
+    columns = {'id': [1,1,1], 'sub_id': [1,2,3], 'value': [4,4,4]}
+    df = pd.DataFrame(columns).set_index(['id', 'sub_id'])
+    orca.add_table('tab', df)
+
+    step = TableFromDisk(name='tab')
+    step.validate()
+
+
+def test_validation_multiindex_not_unique(orca_session):
+    """
+    Table validation should raise a ValueError if the MultiIndex combinations are not 
+    unique. Here the pair (id=1, sub_id=2) appears twice.
+
+    """
+    d = {'id': [1,1,1], 'sub_id': [2,2,3], 'value': [4,4,4]}
+    orca.add_table('tab', pd.DataFrame(d).set_index(['id', 'sub_id']))
+
+    t = TableFromDisk(name='tab')
+
+    # pytest.raises fails the test if validate() returns without raising ValueError
+    with pytest.raises(ValueError):
+        t.validate()
+
+
+def test_validation_unnamed_index(orca_session):
+    """
+    Table validation should raise a ValueError if the index is unnamed.
+
+    """
+    d = {'id': [1,1,3], 'value': [4,4,4]}
+    orca.add_table('tab', pd.DataFrame(d))  # generates auto index without a name
+
+    t = TableFromDisk(name='tab')
+
+    # pytest.raises fails the test if validate() returns without raising ValueError
+    with pytest.raises(ValueError):
+        t.validate()
+
+
+def test_validation_columns_vs_other_indexes(orca_session):
+    """
+    Validate a table that has a column ('households.building_id') with the same name
+    as the index of another registered table ('buildings.building_id').
+
+    Every household building_id is present in the buildings index. The test only
+    requires that validate() completes without raising.
+
+    """
+    households = pd.DataFrame({'household_id': [1,2,3], 'building_id': [2,3,4]})
+    orca.add_table('households', households.set_index('household_id'))
+
+    buildings = pd.DataFrame({'building_id': [1,2,3,4], 'value': [4,4,4,4]})
+    orca.add_table('buildings', buildings.set_index('building_id'))
+
+    step = TableFromDisk(name='households')
+    step.validate()
+
+
+def test_validation_index_vs_other_columns(orca_session):
+    """
+    Validate a table whose index ('buildings.building_id') has the same name as a
+    column of another registered table ('households.building_id').
+
+    One household refers to building 5, which is absent from the buildings index. The
+    test still expects validate() to complete without raising.
+
+    """
+    buildings = pd.DataFrame({'building_id': [1,2,3,4], 'value': [4,4,4,4]})
+    orca.add_table('buildings', buildings.set_index('building_id'))
+
+    households = pd.DataFrame({'household_id': [1,2,3], 'building_id': [2,3,5]})
+    orca.add_table('households', households.set_index('household_id'))
+
+    step = TableFromDisk(name='buildings')
+    step.validate()
+
+
+def test_validation_with_multiindexes(orca_session):
+    """
+    Validate a table whose columns 'choice_table.[home_tract,work_tract]' together
+    match the MultiIndex of another table, 'distances.[home_tract,work_tract]'.
+
+    Some (home_tract, work_tract) pairs in 'choice_table' do not appear in
+    'distances'. The test only requires that validate() completes without raising.
+
+    """
+    choices = pd.DataFrame({
+        'obs_id': [1,1,1,1], 'alt_id': [1,2,3,4], 
+        'home_tract': [55,55,55,55], 'work_tract': [17,46,19,55]})
+    orca.add_table('choice_table', choices.set_index(['obs_id','alt_id']))
+
+    distances = pd.DataFrame({
+        'home_tract': [55,55,55], 'work_tract': [17,18,19], 'dist': [1,1,1]})
+    orca.add_table('distances', distances.set_index(['home_tract','work_tract']))
+
+    step = TableFromDisk(name='choice_table')
+    step.validate()
+
+
+# test that parameters make it through a save
+# test validation with stand-alone columns
+
+# test loading an h5 file works
+# test passing cache settings
+
+
+#################################
+### TESTS OF THE DATA LOADING ###
+#################################
+
+def test_csv(orca_session, data):
+    """
+    Test loading data from a CSV file.
+
+    Registers a CSV-backed table, requests its DataFrame from Orca, and checks that 
+    the table is still listed after modelmanager.initialize() is called again. The 
+    registered step is removed at the end even if an assertion fails, so that it 
+    cannot leak into later tests.
+
+    """
+    t = TableFromDisk()
+    t.name = 'buildings'
+    t.source_type = 'csv'
+    t.path = 'data/buildings.csv'
+    t.csv_index_cols = 'building_id'
+
+    assert 'buildings' not in orca.list_tables()
+
+    modelmanager.register(t)
+    try:
+        assert 'buildings' in orca.list_tables()
+
+        # Orca loads table data lazily, so request the DataFrame to force the read
+        _ = orca.get_table('buildings').to_frame()
+
+        modelmanager.initialize()
+        assert 'buildings' in orca.list_tables()
+    finally:
+        modelmanager.remove_step('buildings')
+
+
+def test_hdf(orca_session, data):
+    """
+    Test loading data from an HDF file.
+
+    Registers an HDF-backed table, requests its DataFrame from Orca, and checks that 
+    the table is still listed after modelmanager.initialize() is called again. The 
+    registered step is removed at the end even if an assertion fails, so that it 
+    cannot leak into later tests.
+
+    """
+    t = TableFromDisk()
+    t.name = 'buildings'
+    t.source_type = 'hdf'
+    t.path = 'data/buildings.hdf'
+
+    assert 'buildings' not in orca.list_tables()
+
+    modelmanager.register(t)
+    try:
+        assert 'buildings' in orca.list_tables()
+
+        # Orca loads table data lazily, so request the DataFrame to force the read
+        _ = orca.get_table('buildings').to_frame()
+
+        modelmanager.initialize()
+        assert 'buildings' in orca.list_tables()
+    finally:
+        modelmanager.remove_step('buildings')
+
+
+def test_extra_settings(orca_session, data):
+    """
+    Test loading data with extra settings, e.g. for compressed files.
+
+    Registers a table backed by a gzip-compressed CSV, passing the compression type 
+    through ``extra_settings``. The registered step is removed at the end even if an 
+    assertion fails, so that it cannot leak into later tests.
+
+    """
+    t = TableFromDisk()
+    t.name = 'buildings'
+    t.source_type = 'csv'
+    t.path = 'data/buildings.csv.gz'
+    t.csv_index_cols = 'building_id'
+    t.extra_settings = {'compression': 'gzip'}
+
+    assert 'buildings' not in orca.list_tables()
+
+    modelmanager.register(t)
+    try:
+        assert 'buildings' in orca.list_tables()
+
+        # Orca loads table data lazily, so request the DataFrame to force the read
+        _ = orca.get_table('buildings').to_frame()
+
+        modelmanager.initialize()
+        assert 'buildings' in orca.list_tables()
+    finally:
+        modelmanager.remove_step('buildings')
+
+
+def test_windows_paths(orca_session, data):
+    """
+    Placeholder for a test, to be run on Windows, that a Windows-style path is
+    properly normalized.
+
+    TO DO - implement. The body is empty, so this test currently passes without
+    checking anything.
+
+    """
+
+
+def test_without_autorun(orca_session, data):
+    """
+    Confirm that disabling autorun works.
+
+    With ``autorun = False``, registering the step should not add the table to Orca. 
+    The registered step is removed at the end even if the assertion fails, so that it 
+    cannot leak into later tests.
+
+    """
+    t = TableFromDisk()
+    t.name = 'buildings'
+    t.source_type = 'csv'
+    t.path = 'data/buildings.csv'
+    t.csv_index_cols = 'building_id'
+    t.autorun = False
+
+    modelmanager.register(t)
+    try:
+        assert 'buildings' not in orca.list_tables()
+    finally:
+        modelmanager.remove_step('buildings')
+
+
+
diff --git a/urbansim_templates/__init__.py b/urbansim_templates/__init__.py
@@ -1 +1 @@
-version = __version__ = '0.1.1'
+version = __version__ = '0.2.dev0'
diff --git a/urbansim_templates/io/.gitignore b/urbansim_templates/io/.gitignore
@@ -0,0 +1 @@
+__pycache__/*
diff --git a/urbansim_templates/io/__init__.py b/urbansim_templates/io/__init__.py
@@ -0,0 +1 @@
+from .tables import TableFromDisk
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		This folder stores data that is temporarily generated during tests.
Original file line number	Diff line number	Diff line change
		@@ -1 +1 @@
		version = __version__ = '0.1.1'
		version = __version__ = '0.2.dev0'