diff --git a/project_list/_csis_utils.py b/project_list/_csis_utils.py
new file mode 100644
index 000000000..5ecec155b
--- /dev/null
+++ b/project_list/_csis_utils.py
@@ -0,0 +1,94 @@
+import pandas as pd 
+def csis_clean_project(df:pd.DataFrame)->pd.DataFrame:
+    """Clean the CTIPS project table and keep one active row per ctips_id.
+
+    Nulls in float64 / object / int64 columns are replaced with 0.0 / 'None' /
+    0; columns of any other dtype are left untouched. The text columns are
+    title-cased, trimmed and have runs of whitespace collapsed. Only the first
+    row for each ctips_id is kept, so the caller's row order decides which
+    version of a project survives. The input frame is not modified.
+    """
+    fills = {'float64': 0.0, 'object': 'None', 'int64': 0}
+    # Build the per-column fill map explicitly so that columns of other dtypes
+    # (e.g. datetimes) get no fill value instead of a dtype object.
+    fill_values = {
+        col: fills[str(dtype)]
+        for col, dtype in df.dtypes.items()
+        if str(dtype) in fills
+    }
+    df = df.fillna(fill_values)
+
+    # Clean strings
+    for col in ['needpurpose', 'proj_desc', 'route1', 'title']:
+        df[col] = df[col].str.title().str.strip().replace(r'\s+', ' ', regex=True)
+
+    # Keep only the first row for each ctips_id
+    df2 = df.drop_duplicates(subset=['ctips_id'])
+
+    # chg_qual1 == 7 marks projects that are deleted; the source does not say
+    # why the other codes, archived rows and DSHOPP documents are excluded.
+    # NOTE(review): 14 is tested on chg_offcl, not chg_qual1 -- confirm intent.
+    keep = (
+        ~df2.chg_qual1.isin([7, 15, 16, 18, 20, 28])
+        & (df2.archive == 0)
+        & (df2.document != "DSHOPP")
+        & (df2.chg_offcl != 14)
+    )
+    return df2.loc[keep]
+
+def add_agencies(left_df: pd.DataFrame, right_df: pd.DataFrame, col: str) -> pd.DataFrame:
+    """Left-join agency names onto ``left_df`` as a ``{col}_agency`` column.
+
+    ``left_df[col]`` is matched against ``right_df['agencyid']`` and the
+    matching ``agency_name`` is appended as ``{col}_agency``. Rows without a
+    match are kept with a null name. Neither input frame is modified.
+    """
+    lookup_key = f'_{col}_agencyid'
+
+    # Rename the lookup columns before merging instead of relying on pandas'
+    # _x/_y suffixes: those only appear when left_df already carries both
+    # 'agencyid' and 'agency_name', and without them the result column was
+    # misnamed or dropping 'agencyid_y' raised KeyError.
+    lookup = right_df.rename(
+        columns={
+            'agencyid': lookup_key,
+            'agency_name': f'{col}_agency',
+        }
+    )
+
+    merged_df = pd.merge(
+        left_df,
+        lookup,
+        left_on=col,
+        right_on=lookup_key,
+        how='left'
+    )
+
+    # The right-hand key duplicates left_df[col]; only the name is wanted.
+    return merged_df.drop(columns=[lookup_key])
+
+def add_counties(left_df: pd.DataFrame, right_df: pd.DataFrame, col: str) -> pd.DataFrame:
+    """Left-join county names onto ``left_df``, replacing ``col`` with ``{col}_county``.
+
+    ``left_df[col]`` is matched against ``right_df['countyid']`` and the
+    matching ``county_name`` is appended as ``{col}_county``. The id column
+    ``col`` itself is dropped from the result. Rows without a match are kept
+    with a null name. Neither input frame is modified.
+    """
+    lookup_key = f'_{col}_countyid'
+
+    # Rename the lookup columns before merging instead of relying on pandas'
+    # _x/_y suffixes: those only appear when left_df already carries both
+    # 'countyid' and 'county_name', and without them the result column was
+    # misnamed or dropping 'countyid_y' raised KeyError.
+    lookup = right_df.rename(
+        columns={
+            'countyid': lookup_key,
+            'county_name': f'{col}_county',
+        }
+    )
+
+    merged_df = pd.merge(
+        left_df,
+        lookup,
+        left_on=col,
+        right_on=lookup_key,
+        how='left'
+    )
+
+    # Both id columns are redundant once the county name has been attached.
+    return merged_df.drop(columns=[lookup_key, col])
+
+def calculate_state_fed_local_total_funds(df:pd.DataFrame, fund_keywords:list, total_col_name:str)->pd.DataFrame:
+    """Add a row-wise total of every column whose name contains a keyword.
+
+    Column names are matched against ``fund_keywords`` case-insensitively by
+    substring. Nulls count as 0. The total is written to
+    ``df[total_col_name]`` on the frame that was passed in, which is also
+    returned.
+    """
+    keywords = [keyword.lower() for keyword in fund_keywords]
+    # Skip an existing total column: its own name can contain a keyword, so
+    # re-running on the same frame would otherwise add the old total in.
+    selected_columns = [
+        col for col in df.columns
+        if col != total_col_name and any(keyword in col.lower() for keyword in keywords)
+    ]
+    df[total_col_name] = df[selected_columns].fillna(0).sum(axis = 1)
+    return df
+
+def clean_political(df:pd.DataFrame, keyword_to_search:str)->pd.DataFrame:
+    """Reshape the flag columns for one keyword into long form.
+
+    Columns whose name contains ``keyword_to_search`` (case-insensitive) are
+    melted against ``ctips_id`` and only cells equal to 1 are kept. The result
+    has ``ctips_id`` plus a column named ``keyword_to_search`` that holds the
+    source column name with the keyword removed.
+    """
+    import re  # local import: this module only imports pandas at the top
+
+    needle = keyword_to_search.lower()
+    filtered_columns = [col for col in df.columns if needle in col.lower()]
+
+    # Make this from wide to long
+    long_df = pd.melt(
+        df[filtered_columns + ['ctips_id']],
+        id_vars=['ctips_id'],
+        value_vars=filtered_columns,
+    )
+
+    # Clean up columns. The keyword is escaped so it is always removed as
+    # literal text (the default of `regex` differs between pandas versions),
+    # and matched case-insensitively to agree with the column selection above.
+    long_df['variable'] = long_df['variable'].str.replace(
+        re.escape(keyword_to_search), '', case=False, regex=True
+    )
+    long_df = long_df.rename(columns={'variable': keyword_to_search})
+
+    # Only keep relevant values for each project
+    flagged = long_df.loc[long_df['value'] == 1.0]
+    return flagged.reset_index(drop=True).drop(columns=['value'])
\ No newline at end of file
diff --git a/project_list/_database_utils.py b/project_list/_database_utils.py
new file mode 100644
index 000000000..9aa964a97
--- /dev/null
+++ b/project_list/_database_utils.py
@@ -0,0 +1,22 @@
+def to_snakecase(df):
+    """Lower-case the column labels of ``df`` and swap spaces for underscores.
+
+    The labels are replaced on ``df`` itself, which is also returned.
+    """
+    lowered = df.columns.str.lower()
+    df.columns = lowered.str.replace(' ','_')
+    return df
+
+# Tag whether something is funded by state, federal, or local funds
+def is_state_funds(row):
+    """Return "Yes" if ``row.total_state_funds`` is greater than 0, otherwise "No"."""
+    return "Yes" if row.total_state_funds > 0 else "No"
+    
+def is_fed_funds(row):
+    """Return "Yes" if ``row.total_federal_funds`` is greater than 0, otherwise "No"."""
+    return "Yes" if row.total_federal_funds > 0 else "No"
+    
+def is_local_funds(row):
+    """Return "Yes" if ``row.total_local_funds`` is greater than 0, otherwise "No"."""
+    return "Yes" if row.total_local_funds > 0 else "No"
\ No newline at end of file
diff --git a/project_list/_harmonization_utils.py b/project_list/_harmonization_utils.py
index 1931c02cd..ae4a32e20 100644
--- a/project_list/_harmonization_utils.py
+++ b/project_list/_harmonization_utils.py
@@ -12,7 +12,6 @@
 def load_state_rail_plan():
     df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)
     return df
-
 def load_lost():
     df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}LOST/LOST_all_projects.xlsx", sheet_name = "Main"))
     
@@ -20,10 +19,6 @@ def load_lost():
     df.estimated_lost_funds = df.estimated_lost_funds* 1_000_000
     
     return df
-
-def load_sb1():
-    return sb1_utils.sb1_final()
-
 """
 Harmonizing
 Functions
diff --git a/project_list/add_LRTP_congestion.ipynb b/project_list/add_LRTP_congestion.ipynb
index 3666615fe..d597bfdfd 100644
--- a/project_list/add_LRTP_congestion.ipynb
+++ b/project_list/add_LRTP_congestion.ipynb
@@ -241,32 +241,32 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>$ 2.35 million</td>\n",
-       "      <td>2350000.00</td>\n",
-       "      <td>25</td>\n",
+       "      <th>235</th>\n",
+       "      <td>$ 0.077 million</td>\n",
+       "      <td>4812.50</td>\n",
+       "      <td>236</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>45</th>\n",
-       "      <td>$ 16.3 million</td>\n",
-       "      <td>16300000.00</td>\n",
-       "      <td>45</td>\n",
+       "      <th>67</th>\n",
+       "      <td>$1.5 million</td>\n",
+       "      <td>750000.00</td>\n",
+       "      <td>69</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>51</th>\n",
-       "      <td>$ 7.5 million</td>\n",
-       "      <td>7500000.00</td>\n",
-       "      <td>49</td>\n",
+       "      <th>216</th>\n",
+       "      <td>$0.006 million</td>\n",
+       "      <td>5760.00</td>\n",
+       "      <td>217</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     fund_estimate  total_project_cost  rtp_id\n",
-       "24  $ 2.35 million          2350000.00      25\n",
-       "45  $ 16.3 million         16300000.00      45\n",
-       "51   $ 7.5 million          7500000.00      49"
+       "       fund_estimate  total_project_cost  rtp_id\n",
+       "235  $ 0.077 million             4812.50     236\n",
+       "67      $1.5 million           750000.00      69\n",
+       "216   $0.006 million             5760.00     217"
       ]
      },
      "execution_count": 9,
@@ -690,14 +690,14 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>80</th>\n",
-       "      <td>Capacity Increasing</td>\n",
+       "      <th>18</th>\n",
+       "      <td>Proposed Improvements</td>\n",
+       "      <td>Lemoore</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>41</td>\n",
-       "      <td>3.8/6.4</td>\n",
-       "      <td>Avenal Creek to s/o SR 33</td>\n",
-       "      <td>Construct Passing Lanes</td>\n",
        "      <td>NaN</td>\n",
+       "      <td>Spring Lane</td>\n",
+       "      <td>100 ft. east of Beverly Dr.</td>\n",
+       "      <td>Overlay</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
@@ -706,14 +706,11 @@
        "</div>"
       ],
       "text/plain": [
-       "               category jurisdiction state_route post_mile  \\\n",
-       "80  Capacity Increasing          NaN          41   3.8/6.4   \n",
-       "\n",
-       "                     location           project_limits description title  \\\n",
-       "80  Avenal Creek to s/o SR 33  Construct Passing Lanes         NaN  None   \n",
+       "                 category jurisdiction state_route post_mile     location  \\\n",
+       "18  Proposed Improvements      Lemoore         NaN       NaN  Spring Lane   \n",
        "\n",
-       "    total_cost  \n",
-       "80           0  "
+       "                 project_limits description title  total_cost  \n",
+       "18  100 ft. east of Beverly Dr.     Overlay  None           0  "
       ]
      },
      "execution_count": 19,
@@ -947,14 +944,14 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>11</th>\n",
+       "      <th>42</th>\n",
        "      <td>Proposed Improvements</td>\n",
-       "      <td>Hanford</td>\n",
+       "      <td>Lemoore</td>\n",
        "      <td>NaN</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>13th Avenue</td>\n",
-       "      <td>Houston Ave. to Lacey Blvd.</td>\n",
-       "      <td>Widen from 2 to 4 lanes with median</td>\n",
+       "      <td>Magnolia Street</td>\n",
+       "      <td>Lemoore Ave. to Smith Ave.</td>\n",
+       "      <td>Overlay</td>\n",
        "      <td>None</td>\n",
        "      <td>0</td>\n",
        "    </tr>\n",
@@ -963,14 +960,11 @@
        "</div>"
       ],
       "text/plain": [
-       "                 category jurisdiction state_route post_mile     location  \\\n",
-       "11  Proposed Improvements      Hanford         NaN       NaN  13th Avenue   \n",
-       "\n",
-       "                 project_limits                          description title  \\\n",
-       "11  Houston Ave. to Lacey Blvd.  Widen from 2 to 4 lanes with median  None   \n",
+       "                 category jurisdiction state_route post_mile         location  \\\n",
+       "42  Proposed Improvements      Lemoore         NaN       NaN  Magnolia Street   \n",
        "\n",
-       "    total_cost  \n",
-       "11           0  "
+       "                project_limits description title  total_cost  \n",
+       "42  Lemoore Ave. to Smith Ave.     Overlay  None           0  "
       ]
      },
      "execution_count": 24,
@@ -1037,39 +1031,39 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>761</th>\n",
-       "      <td>Kern County ‐ Class II Bike Ln</td>\n",
-       "      <td>Houghton Rd (Old River Rd to Union Av) 6. mi.                                                                                  $                       543</td>\n",
-       "      <td>543000</td>\n",
-       "      <td>543000</td>\n",
+       "      <th>360</th>\n",
+       "      <td>Metro Passenger Rail</td>\n",
+       "      <td>Amtrak Station ‐ Phase II                                                                                                                         $                   13,000</td>\n",
+       "      <td>13000000</td>\n",
+       "      <td>13000000</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>543000</td>\n",
+       "      <td>13000000</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>327</th>\n",
-       "      <td>Route 99</td>\n",
-       "      <td>Rt 99 @ Minkler Spur ‐ construct grade separation</td>\n",
-       "      <td>52152000</td>\n",
-       "      <td>69000000</td>\n",
-       "      <td>16848000</td>\n",
-       "      <td>52152000</td>\n",
+       "      <th>1126</th>\n",
+       "      <td>Mendiburu Path / California City Blvd‐88</td>\n",
+       "      <td>Class I Shared Use Path ‐ 1.6 mile ‐ Add new off‐St class I shared use path</td>\n",
+       "      <td>1445000</td>\n",
+       "      <td>1445000</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>1445000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                      project_title  \\\n",
-       "761  Kern County ‐ Class II Bike Ln   \n",
-       "327                        Route 99   \n",
+       "                                 project_title  \\\n",
+       "360                       Metro Passenger Rail   \n",
+       "1126  Mendiburu Path / California City Blvd‐88   \n",
        "\n",
-       "                                                                                                                                                          scope  \\\n",
-       "761  Houghton Rd (Old River Rd to Union Av) 6. mi.                                                                                  $                       543   \n",
-       "327                                                                                                           Rt 99 @ Minkler Spur ‐ construct grade separation   \n",
+       "                                                                                                                                                                             scope  \\\n",
+       "360   Amtrak Station ‐ Phase II                                                                                                                         $                   13,000   \n",
+       "1126                                                                                                   Class I Shared Use Path ‐ 1.6 mile ‐ Add new off‐St class I shared use path   \n",
        "\n",
-       "    yoe_w__new_revenue yoe_w_o_new_reven maint__inflation_savings      cost  \n",
-       "761             543000            543000                      NaN    543000  \n",
-       "327           52152000          69000000                 16848000  52152000  "
+       "     yoe_w__new_revenue yoe_w_o_new_reven maint__inflation_savings      cost  \n",
+       "360            13000000          13000000                      NaN  13000000  \n",
+       "1126            1445000           1445000                      NaN   1445000  "
       ]
      },
      "execution_count": 26,
@@ -1886,32 +1880,32 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>15</th>\n",
-       "      <td>Gustine</td>\n",
-       "      <td>Borelli Ranch Park Multi-use Path</td>\n",
-       "      <td>Construct a Multi-use Path from Fentem Rd to the end of Via Palermo</td>\n",
-       "      <td>Active (Bike/Ped)</td>\n",
-       "      <td>2030</td>\n",
-       "      <td>450</td>\n",
-       "      <td>CMAQ, Local, Measure V</td>\n",
-       "      <td>450000</td>\n",
+       "      <th>54</th>\n",
+       "      <td>Merced</td>\n",
+       "      <td>SR-59 Merced Widening Phase 4</td>\n",
+       "      <td>Widen 2 to 4 lanes from Cardella Rd to Bellevue Rd</td>\n",
+       "      <td>Road Capacity</td>\n",
+       "      <td>2045</td>\n",
+       "      <td>30000</td>\n",
+       "      <td>SB-1, Measure V, Local, SHOPP</td>\n",
+       "      <td>30000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     agency                              title  \\\n",
-       "15  Gustine  Borelli Ranch Park Multi-use Path   \n",
+       "    agency                          title  \\\n",
+       "54  Merced  SR-59 Merced Widening Phase 4   \n",
        "\n",
-       "                                                     limits_description  \\\n",
-       "15  Construct a Multi-use Path from Fentem Rd to the end of Via Palermo   \n",
+       "                                    limits_description           type  \\\n",
+       "54  Widen 2 to 4 lanes from Cardella Rd to Bellevue Rd  Road Capacity   \n",
        "\n",
-       "                 type completion\\nyear total_cost\\n_$1,000s_  \\\n",
-       "15  Active (Bike/Ped)             2030                   450   \n",
+       "   completion\\nyear total_cost\\n_$1,000s_                funding_sources  \\\n",
+       "54             2045                 30000  SB-1, Measure V, Local, SHOPP   \n",
        "\n",
-       "           funding_sources total_cost_millions  \n",
-       "15  CMAQ, Local, Measure V              450000  "
+       "   total_cost_millions  \n",
+       "54            30000000  "
       ]
      },
      "execution_count": 39,
@@ -2135,37 +2129,37 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1452</th>\n",
-       "      <td>YOL17360</td>\n",
-       "      <td>Project Development Only</td>\n",
-       "      <td>YOL</td>\n",
-       "      <td>City of Woodland</td>\n",
+       "      <th>723</th>\n",
+       "      <td>SAC24111</td>\n",
+       "      <td>Programmed</td>\n",
+       "      <td>SAC</td>\n",
+       "      <td>City of Elk Grove</td>\n",
        "      <td>B- Road &amp; Highway Capacity</td>\n",
-       "      <td>Parkland Ave.</td>\n",
-       "      <td>Construct New Road: 2 lane arterial from Pioneer Ave. to East St.</td>\n",
-       "      <td>9044751</td>\n",
+       "      <td>Lotz Parkway</td>\n",
+       "      <td>In Elk Grove, Lotz Parkway from Whitelock Parkway to Poppy Ridge Road: Construct new 4-lane roadway; and Lotz Parkway from Poppy Ridge Road to\\n0.5 miles south of Whitelock Pkwy at the northern boundary of the Sterling\\nMeadows development area:  Construct new 2-lane roadway.</td>\n",
+       "      <td>8662500</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>Post-2040</td>\n",
+       "      <td>2020-2025</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "            id status__planned,_programmed_or_project_development_only_  \\\n",
-       "1452  YOL17360                                 Project Development Only   \n",
+       "           id status__planned,_programmed_or_project_development_only_ county  \\\n",
+       "723  SAC24111                                               Programmed    SAC   \n",
        "\n",
-       "     county       lead_agency             budget_category          title  \\\n",
-       "1452    YOL  City of Woodland  B- Road & Highway Capacity  Parkland Ave.   \n",
+       "           lead_agency             budget_category         title  \\\n",
+       "723  City of Elk Grove  B- Road & Highway Capacity  Lotz Parkway   \n",
        "\n",
-       "                                                            description  \\\n",
-       "1452  Construct New Road: 2 lane arterial from Pioneer Ave. to East St.   \n",
+       "                                                                                                                                                                                                                                                                              description  \\\n",
+       "723  In Elk Grove, Lotz Parkway from Whitelock Parkway to Poppy Ridge Road: Construct new 4-lane roadway; and Lotz Parkway from Poppy Ridge Road to\\n0.5 miles south of Whitelock Pkwy at the northern boundary of the Sterling\\nMeadows development area:  Construct new 2-lane roadway.   \n",
        "\n",
-       "     total_project_cost__2018_dollars_  \\\n",
-       "1452                           9044751   \n",
+       "    total_project_cost__2018_dollars_  \\\n",
+       "723                           8662500   \n",
        "\n",
-       "     year_of_expenditure_cost_for_planned_projects completion_timing  \n",
-       "1452                                           NaN         Post-2040  "
+       "    year_of_expenditure_cost_for_planned_projects completion_timing  \n",
+       "723                                           NaN         2020-2025  "
       ]
      },
      "execution_count": 51,
@@ -2482,29 +2476,35 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>197</th>\n",
-       "      <td>Planned</td>\n",
-       "      <td>C-PL-6: Franklin Creek Multiuse Path</td>\n",
-       "      <td>Construction</td>\n",
-       "      <td>Construct a multiuse path along Franklin Creek from Carpinteria Ave to 7th St.</td>\n",
+       "      <th>367</th>\n",
+       "      <td>VMT Reducing</td>\n",
+       "      <td>SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings</td>\n",
+       "      <td>PA&amp;ED, PS&amp;E,\\nConstruction</td>\n",
+       "      <td>Implement  a road diet on De La Vina Street from Constance Avenue to Padre Street.  Crossing enhancements included.</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>2023</td>\n",
-       "      <td>750</td>\n",
-       "      <td>750000</td>\n",
+       "      <td>2050</td>\n",
+       "      <td>1988</td>\n",
+       "      <td>1988000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "        type                         project_title         phase  \\\n",
-       "197  Planned  C-PL-6: Franklin Creek Multiuse Path  Construction   \n",
+       "             type  \\\n",
+       "367  VMT Reducing   \n",
+       "\n",
+       "                                                    project_title  \\\n",
+       "367  SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings   \n",
        "\n",
-       "                                                                        description  \\\n",
-       "197  Construct a multiuse path along Franklin Creek from Carpinteria Ave to 7th St.   \n",
+       "                          phase  \\\n",
+       "367  PA&ED, PS&E,\\nConstruction   \n",
+       "\n",
+       "                                                                                                             description  \\\n",
+       "367  Implement  a road diet on De La Vina Street from Constance Avenue to Padre Street.  Crossing enhancements included.   \n",
        "\n",
        "    primary_funding_source_s_  year total_cost__$000s_ total_cost_millions  \n",
-       "197                       NaN  2023                750              750000  "
+       "367                       NaN  2050               1988             1988000  "
       ]
      },
      "execution_count": 57,
@@ -3010,7 +3010,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 74,
+   "execution_count": 69,
    "id": "e3f73aaa-91e9-4290-8335-ac961215b1c9",
    "metadata": {},
    "outputs": [
@@ -3052,75 +3052,51 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>593</th>\n",
-       "      <td>PASSENGER RAIL</td>\n",
-       "      <td>CALIFORNIA HIGH SPEED RAIL AUTHORITY</td>\n",
-       "      <td>1TR1012</td>\n",
+       "      <th>812</th>\n",
+       "      <td>TRANSIT</td>\n",
+       "      <td>LOS ANGELES COUNTY MTA (METRO)</td>\n",
+       "      <td>1TL0703</td>\n",
        "      <td>0</td>\n",
-       "      <td>CALIFORNIA HIGH-\\nSPEED RAIL</td>\n",
-       "      <td>NaN</td>\n",
+       "      <td>METRO RAIL TRANSIT CAPITAL</td>\n",
+       "      <td>COUNTYWIDE</td>\n",
        "      <td>NaN</td>\n",
-       "      <td>CALIFORNIA HIGH-SPEED RAIL PHASE 1 - ENV/PE</td>\n",
-       "      <td>2021</td>\n",
-       "      <td>332000</td>\n",
+       "      <td>RAIL CAPITAL PROJECTS</td>\n",
+       "      <td>2040</td>\n",
+       "      <td>19151000</td>\n",
        "      <td>NaN</td>\n",
        "      <td>No Title</td>\n",
-       "      <td>332000000</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2941</th>\n",
-       "      <td>PASSENGER RAIL</td>\n",
-       "      <td>CHSRA</td>\n",
-       "      <td>7120010</td>\n",
-       "      <td>0</td>\n",
-       "      <td>CALIFORNIA HIGH-\\nSPEED RAIL</td>\n",
-       "      <td>REGIONWIDE</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>CALIFORNIA HIGH-SPEED RAIL - PHASE 1 (INCLUDES METROLINK AND\\nLOSSAN CORRIDOR SPEED UPGRADES)</td>\n",
-       "      <td>2033</td>\n",
-       "      <td>38960000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>No Title</td>\n",
-       "      <td>38960000000</td>\n",
+       "      <td>19151000000</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "              system                           lead_agency   rtp_id route_#  \\\n",
-       "593   PASSENGER RAIL  CALIFORNIA HIGH SPEED RAIL AUTHORITY  1TR1012       0   \n",
-       "2941  PASSENGER RAIL                                 CHSRA  7120010       0   \n",
-       "\n",
-       "                        route_name        from   to  \\\n",
-       "593   CALIFORNIA HIGH-\\nSPEED RAIL         NaN  NaN   \n",
-       "2941  CALIFORNIA HIGH-\\nSPEED RAIL  REGIONWIDE  NaN   \n",
+       "      system                     lead_agency   rtp_id route_#  \\\n",
+       "812  TRANSIT  LOS ANGELES COUNTY MTA (METRO)  1TL0703       0   \n",
        "\n",
-       "                                                                                        description  \\\n",
-       "593                                                     CALIFORNIA HIGH-SPEED RAIL PHASE 1 - ENV/PE   \n",
-       "2941  CALIFORNIA HIGH-SPEED RAIL - PHASE 1 (INCLUDES METROLINK AND\\nLOSSAN CORRIDOR SPEED UPGRADES)   \n",
+       "                     route_name        from   to            description  \\\n",
+       "812  METRO RAIL TRANSIT CAPITAL  COUNTYWIDE  NaN  RAIL CAPITAL PROJECTS   \n",
        "\n",
-       "     completion_year project_cost__$1,000s_ county project_title  \\\n",
-       "593             2021                 332000    NaN      No Title   \n",
-       "2941            2033               38960000    NaN      No Title   \n",
+       "    completion_year project_cost__$1,000s_ county project_title  \\\n",
+       "812            2040               19151000    NaN      No Title   \n",
        "\n",
-       "     project_cost_millions  \n",
-       "593              332000000  \n",
-       "2941           38960000000  "
+       "    project_cost_millions  \n",
+       "812           19151000000  "
       ]
      },
-     "execution_count": 74,
+     "execution_count": 69,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "scag.loc[scag.description.str.contains(\"California High-Speed Rail\", case=False)]\n"
+    "scag.loc[scag.description.str.contains(\"Rail Capital Projects\", case=False)]\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 75,
+   "execution_count": 70,
    "id": "78be03f0-de29-48f2-a1d6-3506c57081d5",
    "metadata": {},
    "outputs": [
@@ -3130,7 +3106,7 @@
        "38960000000"
       ]
      },
-     "execution_count": 75,
+     "execution_count": 70,
      "metadata": {},
      "output_type": "execute_result"
     }
diff --git a/project_list/add_ctips_02_21_2024.ipynb b/project_list/add_ctips_02_21_2024.ipynb
new file mode 100644
index 000000000..97cc968de
--- /dev/null
+++ b/project_list/add_ctips_02_21_2024.ipynb
@@ -0,0 +1,2402 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "1e57c806",
+   "metadata": {},
+   "source": [
+    "## CTIPS\n",
+    "* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do\n",
+    "\n",
+    "### To do\n",
+    "* Get total cost\n",
+    "    * total_cost: The total cost of this project.\n",
+    "    Total project cost can be calculated using 3 tables: project, fundtype, and fundline\n",
+    "    AH: which columns do I use from fundtype, project, and fundline to calculate the total cost?\n",
+    "    You can calculate total programmed for a project using: fundline.action = P and project.high_offcl = 1\n",
+    "    Then sum ( fundline.pe_paed + fundline.pe_env + fundline.pe_rw + fundline.pe_con + fundline.rw + fundline.con )\n",
+    "\n",
+    "* Ask if DSHOPP means draft SHOPP project\n",
+    "* PROJSCHE - not a lot of matches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "331fca6b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd \n",
+    "import sqlalchemy \n",
+    "import sys \n",
+    "import re\n",
+    "import oracledb \n",
+    "import _database_utils as _utils \n",
+    "import _csis_utils"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cdf61f63",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "oracledb.version = \"8.3.0\" \n",
+    "sys.modules[\"cx_Oracle\"] = oracledb "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b510d7b1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.options.display.max_columns = 400\n",
+    "pd.options.display.float_format = \"{:.2f}\".format\n",
+    "pd.set_option(\"display.max_rows\", None)\n",
+    "pd.set_option(\"display.max_colwidth\", None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "06b673b3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "ENGINE_PATH_WIN_AUTH =  f\"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}\" "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7525e19c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "9a6c14bd",
+   "metadata": {},
+   "source": [
+    "## Project Base Table\n",
+    "### Project\n",
+    "Project.agencyid = project sponsor\n",
+    "\n",
+    "Implpaed = Implementing Agency for PA&ED\n",
+    "\n",
+    "Implpse = Implementing Agency for PS&E\n",
+    "\n",
+    "implcon = Implementing Agency for Construction\n",
+    "\n",
+    "implrw = Implementing Agency for Right of Way\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "87510d9c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "ctips_id,\n",
+    "appdate, \n",
+    "archive,\n",
+    "agencyid,\n",
+    "const_date,\n",
+    "countyid,\n",
+    "countyid2,\n",
+    "countyid3,\n",
+    "chg_offcl,\n",
+    "chg_qual1,\n",
+    "chg_qual2,\n",
+    "districtid,\n",
+    "document,\n",
+    "docyear,\n",
+    "ea_number,\n",
+    "high_ver,\n",
+    "high_offcl,\n",
+    "implpaed, \n",
+    "implpse, \n",
+    "implrw, \n",
+    "implcon, \n",
+    "needpurpose,\n",
+    "progcode1,\n",
+    "ppno,\n",
+    "proj_desc,\n",
+    "postmiles1,\n",
+    "pm1b,\n",
+    "pm2b,\n",
+    "pm3b,\n",
+    "pm1a,\n",
+    "pm2a,\n",
+    "pm3a,\n",
+    "projcomp_date,\n",
+    "projectid,\n",
+    "route1,\n",
+    "route2,\n",
+    "route3,\n",
+    "rtl,\n",
+    "title,\n",
+    "version\n",
+    "FROM ctips.project\n",
+    "ORDER BY high_ver DESC, version DESC, high_offcl DESC\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6e6fb595",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_df.document.unique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1fc58d5e",
+   "metadata": {},
+   "source": [
+    "#### DOUBLE check filtering on document"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "de82322c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = _csis_utils.csis_clean_project(projects_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "05f51a07",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.document.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "32ae2bab",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(projects_table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1a634f44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4adbcc27",
+   "metadata": {},
+   "source": [
+    "### PROJSCHE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "096e0ffd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projsche_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "projectid,\n",
+    "m020 AS pa_ed_begin,\n",
+    "m200a AS pa_ed_end,\n",
+    "m200b AS ps_e_begin,\n",
+    "m224 AS begin_row,\n",
+    "m410 AS end_row,\n",
+    "m500 AS con_start_date,\n",
+    "m600 AS con_end_date,\n",
+    "m700 AS begin_closeout,\n",
+    "m800 AS end_closeout\n",
+    "FROM ctips.projsche\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0c32c036",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projsche_drop_cols = list(projsche_df.columns)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8a85af0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projsche_drop_cols.remove('projectid')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "acb324e6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I want to drop the rows in which ALL values in the date columns are empty\n",
+    "projsche_df2 = projsche_df.dropna(how = \"all\", subset = projsche_drop_cols).reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc6578cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(projsche_df2), len(projsche_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fef2ed85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projsche_df2.projectid.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c613a019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projsche_df2.projectid.value_counts().head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "320571a7",
+   "metadata": {},
+   "source": [
+    "#### Not a lot of matching values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "311506d2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.merge(projsche_df2, projects_table, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c6def815",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phase_dates_df = pd.merge(projects_table[['ctips_id', 'projectid']], projsche_df2,  on ='projectid', how = 'inner')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68d8a80c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phase_dates_df.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe1013ec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phase_dates_df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6aee9af0",
+   "metadata": {},
+   "source": [
+    "### AGENCY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fda0b02a",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "agency_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "name AS agency_name,\n",
+    "agencyid\n",
+    "FROM ctips.agncy\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "68229e79",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agency_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d3e86563",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = pd.merge(projects_table, agency_df,  on ='agencyid', how = 'left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d6177b3d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phase_agency_cols = ['implpaed','implpse','implrw', 'implcon']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "197380b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in phase_agency_cols:\n",
+    "    projects_table = _csis_utils.add_agencies(projects_table, agency_df, i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34b5bffe",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f0675e4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = projects_table.drop(columns = phase_agency_cols)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f39f398c",
+   "metadata": {},
+   "source": [
+    "### COUNTY"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "72df0bb3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "name AS county_name,\n",
+    "countyid\n",
+    "FROM ctips.county\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a3be2d02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df = pd.merge(projects_table[['ctips_id','countyid', 'countyid2', 'countyid3']], county_df,  on ='countyid', how = 'left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "43179a66",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "421e9af2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "countyid_cols =  ['countyid2', 'countyid3']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "74dfdb02",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "for i in countyid_cols:\n",
+    "    county_projects_df =  _csis_utils.add_counties(county_projects_df, county_df, i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ffaf0f1e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df = county_projects_df[['ctips_id', 'county_name', 'countyid2_county', 'countyid3_county']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e617a674",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a36adee5",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# Melt this from wide to long\n",
+    "county_projects_df2 = pd.melt(county_projects_df, id_vars=['ctips_id'], value_vars=['county_name','countyid2_county','countyid3_county'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1597b8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4311b58e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2 = county_projects_df2.sort_values(by = ['ctips_id']).dropna().reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adc68256",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2 = county_projects_df2.drop(columns = ['variable']).rename(columns = {'value':'county'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb00e398",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2.ctips_id.value_counts().describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa7b2579",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2.loc[county_projects_df2.ctips_id == 10600002937]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2f1d8ea6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = projects_table.drop(columns = countyid_cols)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "87d196cc",
+   "metadata": {},
+   "source": [
+    "### FUNDLINE\n",
+    "* For action: P = programmed, V = vote, A = award"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7fc7047a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fundline_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "    action,\n",
+    "    con,\n",
+    "    rw,\n",
+    "    pe_paed,\n",
+    "    pe_env,\n",
+    "    pe_rw,\n",
+    "    pe_con,\n",
+    "    pe_total,\n",
+    "    fundlineid,\n",
+    "    fundtypeid,\n",
+    "    line_year,\n",
+    "    actiondate\n",
+    "FROM ctips.fundline\n",
+    "WHERE action = 'P'\n",
+    "\"\"\", engine)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cc6ac7b6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fundline_df.action.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d38f6532",
+   "metadata": {},
+   "source": [
+    "### Fundtype\n",
+    "* Fundtype.agencyid = funding agency"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "03ad06ed",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fundtype_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "fundtypeid,\n",
+    "fundid,\n",
+    "progcode,\n",
+    "programid,\n",
+    "projectid,\n",
+    "agencyid\n",
+    "FROM ctips.fundtype\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "471153ad",
+   "metadata": {},
+   "source": [
+    "#### Merge everything"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2242bbf2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.merge(fundtype_df,\n",
+    "         fundline_df,  \n",
+    "         on = ['fundtypeid'], \n",
+    "         how = \"outer\",\n",
+    "         indicator = True,)[['_merge']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "56d38a60",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.merge(fundtype_df,fundline_df,  on = ['fundtypeid'], how = \"outer\", indicator = True)[['_merge']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc6f24e1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_m1 = pd.merge(fundtype_df,fundline_df,  on = ['fundtypeid'], how = \"left\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8939f75b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_fund_m = pd.merge(projects_table[['projectid','ctips_id', 'document']], fund_m1, on = ['projectid'], how = \"inner\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51fa4c0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_fund_m.ctips_id.nunique(), len(final_fund_m)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a2301d5a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "42f5bd20",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "29152-29116"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0722f47b",
+   "metadata": {},
+   "source": [
+    "### Progmain"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f612c3ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "progmain_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "programid,\n",
+    "category AS program\n",
+    "FROM ctips.progmain\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dae62ff2",
+   "metadata": {},
+   "source": [
+    "### Fund"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "abeb1cf9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "fund,\n",
+    "fundid,\n",
+    "type AS fund_type_1_fed_2_state_3_local\n",
+    "FROM ctips.fund\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ae1f680e",
+   "metadata": {},
+   "source": [
+    "### Progsub\n",
+    "* Some progcodes have more than one progdesc\n",
+    "* Dropped duplicates because the progdesc values are similar\n",
+    "double_ids = ['20.30.010.820',\n",
+    "             '20.XX.723.000',\n",
+    "            '20.30.010.810',\n",
+    "             '20.XX.720.100',\n",
+    "             '20.30.010.817',\n",
+    "              '20.30.210.200'\n",
+    "             ]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2f0bcec",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "progsub_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "progcode,\n",
+    "progdesc\n",
+    "FROM ctips.progsub\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88794e92",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "progsub_df2 = progsub_df.drop_duplicates(subset = ['progcode'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "236a5e98",
+   "metadata": {},
+   "source": [
+    "### Merge for work below"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e438665d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_fund_m.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d7341c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info = ((final_fund_m\n",
+    "                           .merge(progmain_df, on = ['programid'], how = \"left\")\n",
+    "                           .merge(fund_df, on =['fundid'], how = \"left\")\n",
+    "                           .merge(progsub_df2, on = ['progcode'], how = \"left\"))\n",
+    "                           .drop(columns = ['fundid','programid', 'progcode']))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e8efa77b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.fund.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fe4ec8c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info['fund'] = funding_w_program_info.fund + '-' + funding_w_program_info.fund_type_1_fed_2_state_3_local.astype('str')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "69499228",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info = funding_w_program_info.fillna(funding_w_program_info.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "29d0d6f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0311f5c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c6ae432",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.action.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b203d580",
+   "metadata": {},
+   "source": [
+    "#### Keep only programmed funds (`action == 'P'`) for now."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea7a292d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info = funding_w_program_info.loc[funding_w_program_info.action == \"P\"].reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "0920e3c2",
+   "metadata": {},
+   "source": [
+    "## Phase Funding Table"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "03e10f95",
+   "metadata": {},
+   "source": [
+    "#### First: find the total amount of funds a project has programmed/voted/awarded for each fund"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b4bde774",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "columns_to_agg = {**dict.fromkeys(['con', 'rw',\n",
+    "       'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total'], 'sum')}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "531ee9f5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "columns_to_agg"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "11b5f191",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I want to find the total funds a project will receive for each fund\n",
+    "total_cost = funding_w_program_info.groupby(['ctips_id','fund','document']).agg(columns_to_agg).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "17396ff5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_cost.sample()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f6059d2f",
+   "metadata": {},
+   "source": [
+    "##### Separate out FTIP and everything else to calculate total funds a project is estimated to receive"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c2c6638f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Calculate FTIP and other documents in 2 stages\n",
+    "ftip_only = total_cost.loc[total_cost.document.isin(['FTIP','DFTIP' ])].reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "48c83f88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_cost_ftip = ftip_only.groupby(['ctips_id', 'fund']).agg(columns_to_agg).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8bd716d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cols_to_keep = ['ctips_id', 'fund', 'total_cost']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e613cf0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_cost_ftip['total_cost'] = total_cost_ftip.con + total_cost_ftip.rw + total_cost_ftip.pe_total"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f74a386e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_cost_ftip = total_cost_ftip[cols_to_keep]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1118b944",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "everything_else = total_cost.loc[~total_cost.document.isin(['FTIP','DFTIP'])].reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3070d228",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "everything_else = everything_else.groupby(['ctips_id', 'fund']).agg(columns_to_agg).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d5ad472a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "everything_else['total_cost'] = everything_else.con + everything_else.rw + everything_else.pe_paed + everything_else.pe_env + everything_else.pe_rw + everything_else.pe_con"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e24f20fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "everything_else = everything_else[cols_to_keep]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f442ffe1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_requested_funds_final = pd.concat([everything_else, total_cost_ftip])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9c2d01f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(total_requested_funds_final), total_requested_funds_final.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f48aed8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_requested_funds_final.ctips_id.value_counts().describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cd9c7213",
+   "metadata": {},
+   "source": [
+    "##### One project"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "36c4d445",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "total_requested_funds_final.loc[total_requested_funds_final.ctips_id == 20300000209]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f4c616bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "8900.00+63400.00"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "aa2dbaff",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "9700.00+1100.00"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e7b5497",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Original \n",
+    "funding_w_program_info.loc[funding_w_program_info.ctips_id == 20300000209]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6921ff42",
+   "metadata": {},
+   "source": [
+    "#### Pivot - I want the dataframe to be wide instead of long"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e73a815",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table = total_requested_funds_final.pivot_table(index=['ctips_id'], columns='fund', \n",
+    "                    values=['total_cost'], aggfunc='sum')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "369e76a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table.columns = fund_table.columns.droplevel()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8da306ee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table = fund_table.reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "089794b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0bba2cfa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table = _utils.to_snakecase(fund_table)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dd91c841",
+   "metadata": {},
+   "source": [
+    "#### Second: find the amount of $ for each phase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "08582661",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cost_per_phase = funding_w_program_info.groupby(['ctips_id']).agg(columns_to_agg).reset_index()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b362f3c0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cost_per_phase.shape, cost_per_phase.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "439f1946",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "63400.00+9700.00+11300"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9e885972",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "fund_table.loc[fund_table.ctips_id == 20300000209].dropna(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b1c458ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cost_per_phase.loc[cost_per_phase.ctips_id == 20300000209].dropna(axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08564b97",
+   "metadata": {},
+   "source": [
+    "#### Third: merge these 2 tables"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "41e08290",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table = pd.merge(fund_table, cost_per_phase, on = [\"ctips_id\"], how = \"inner\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "79f2f2f2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(final_phase_funding_table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2c33f3ae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table.head(2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "97ad053f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a824983c",
+   "metadata": {},
+   "source": [
+    "#### Fourth: find state v federal \n",
+    "* State funds are a lot more."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b3471a77",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "federal_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['1.0'], 'total_federal_funds')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4b164f51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "state_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['2.0'], 'total_state_funds')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "356faf47",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "local_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['3.0'], 'total_local_funds')\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1c0975b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table[\"is_state\"] = final_phase_funding_table.apply(_utils.is_state_funds, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14e14a4c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table[\"is_federal\"] = final_phase_funding_table.apply(_utils.is_fed_funds, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7a125f0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table[\"is_local\"] = final_phase_funding_table.apply(_utils.is_local_funds, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f83111de",
+   "metadata": {},
+   "source": [
+    "##### Check that I summed up federal funds correctly\n",
+    "* State not summing up correctly"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f9609041",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.loc[funding_w_program_info.ctips_id == 20920011849][['fund_type_1_fed_2_state_3_local']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7134d80d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['con']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "421b4f27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['rw']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c24b6f55",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['pe_total']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6671324b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "112386000.00+12213000.00+4351000.00 == 128950000.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "14d0f887",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table.loc[final_phase_funding_table.ctips_id == 20920011849].dropna(axis=1).T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9a1bbfda",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "(final_phase_funding_table\n",
+    " .groupby(['is_state', 'is_federal', 'is_local'])\n",
+    " .agg({'ctips_id':'nunique'})\n",
+    " .reset_index()\n",
+    " .sort_values(by = ['ctips_id']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ded58f7d",
+   "metadata": {},
+   "source": [
+    "#### Fifth: Drop everything before `con`\n",
+    "* Need to differentiate between `pe_total` for FTIP vs `pe_total` for everything else."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e10c8f52",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "to_keep = ['ctips_id','con','rw', 'pe_env', 'pe_rw', 'pe_con', 'pe_total', 'total_federal_funds',\n",
+    "       'total_state_funds', 'total_local_funds', 'is_local', 'is_state',\n",
+    "       'is_federal']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d7173842",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table2 = final_phase_funding_table[to_keep]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ea702394",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table2.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d5a5f2bb",
+   "metadata": {},
+   "source": [
+    "#### Sixth: Merge on `phase_dates_df` with all the phase dates"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6025a9f1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "phase_dates_df.ctips_id.nunique(), len(phase_dates_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7cafa19d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.ctips_id.nunique(), len(projects_table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f7d0efa3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "77a24a8c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table2 = pd.merge(final_phase_funding_table2, phase_dates_df, on = \"ctips_id\", how = \"outer\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "023f3588",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.merge(final_phase_funding_table2, phase_dates_df, on = \"ctips_id\", how = \"outer\", indicator = True)[['_merge']].value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3fc198d0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table2.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b6c2a48f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a7dc1126",
+   "metadata": {},
+   "source": [
+    "#### Seventh: Merge some other dates found in the `projects` dataframe."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "94edbad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_date_cols = ['const_date', 'rtl', 'ctips_id', 'projcomp_date']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0e2fe592",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_dates = projects_table[project_date_cols]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "73a1417a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "project_date_cols.remove('ctips_id')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e4e98bd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = projects_table.drop(columns = project_date_cols)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "57b81fba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# I'm only interested in rows in which at least one of the dates is populated\n",
+    "projects_dates2 = projects_dates.loc[(projects_dates.rtl != 'datetime64[ns]') |  (projects_dates.const_date != 'datetime64[ns]')].reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "818abaad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_dates2 = projects_dates2.rename(columns = {'const_date': 'construction_completion_date', 'rtl':'ready_to_list_date'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "10943300",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table3 = pd.merge(final_phase_funding_table2, projects_dates2, on = 'ctips_id', how = 'left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e0b5c9fd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table3.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5002bf51",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4f590b0e",
+   "metadata": {},
+   "source": [
+    "### Awards Table\n",
+    "* Take final_fund_m and sort it by year\n",
+    "* Line year is \"fiscal year of this fund record. Note that the year listed is the second in the pair of fiscal year notation.  For example, if the funds for this record are for fiscal year 1998/99, then this record will hold the value 1999.\"\n",
+    "* These aren't really programs; are they funds?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "52bfb5ea",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# del out '',\n",
+    "awards = funding_w_program_info[['ctips_id','line_year', 'program', 'progdesc']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbcb89fb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop rows that are exact duplicates across all columns\n",
+    "awards2 = awards.drop_duplicates().reset_index(drop = True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2fe1729",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(awards), len(awards2)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "306fde98",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards3 = awards2.sort_values(by = ['ctips_id','program','line_year', ], ascending = [False, False, False])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef54634e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4 = awards3.drop_duplicates(subset = ['ctips_id','program'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "84467f0d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "de8480c4",
+   "metadata": {},
+   "source": [
+    "#### Check w/ one project"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6c4766bd",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "# awards3.loc[awards3.ctips_id == 20700001649]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8048727d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4.loc[awards4.ctips_id == 20700001649]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f0c78a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4.ctips_id.value_counts().describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bd4cacf9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4.ctips_id.value_counts().head(10)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "33ec46a8",
+   "metadata": {},
+   "source": [
+    "### Political"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d256a277",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df = pd.read_sql_query(\"\"\" \n",
+    "SELECT \n",
+    "*\n",
+    "FROM ctips.politcal\n",
+    "\"\"\", engine) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "95616eb2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Drop any rows with nulls\n",
+    "# There are a bunch of rows that are 0 \n",
+    "political_df_without_na = political_df.dropna(how = \"any\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3ede35ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(political_df), political_df.projectid.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3d0b0790",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bf56a9c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(political_df_without_na)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef597573",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df_without_na.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "49ccb448",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df2 = pd.merge(projects_table[['ctips_id', 'projectid']], political_df_without_na, on ='projectid', how = 'inner')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ee357e4e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df2.shape, political_df2.projectid.nunique(), political_df2.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "82c8e22c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assembly_df = _csis_utils.clean_political(political_df2, 'assembly')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3e34803a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assembly_df.ctips_id.value_counts().head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1d8586e2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(assembly_df), assembly_df.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "4e36025e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "senate_df = _csis_utils.clean_political(political_df2, 'ssenate')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "0d02b8b0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(senate_df), senate_df.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "37cfedd3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ushouse_df = _csis_utils.clean_political(political_df2, 'ushouse')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "adbf20b9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "len(ushouse_df), ushouse_df.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "192df3c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ushouse_df.ushouse.value_counts().head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ba49612f",
+   "metadata": {},
+   "source": [
+    "#### Double check"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "58fbabd0",
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "assembly_df.loc[assembly_df.ctips_id == 10900000289]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2940ed0f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "political_df2.loc[political_df2.ctips_id == 10900000289]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "638d08e0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# projects_table.loc[projects_table.ctips_id == 10900000289]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c2b6fa42",
+   "metadata": {},
+   "source": [
+    "## Save to Excel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d08d935d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "district_df = projects_table[['ctips_id','districtid']].drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c9655708",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "district_df.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "31352c30",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# projects_table = projects_table.fillna(projects_table.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "622ac980",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "district_df = district_df.fillna(district_df.dtypes.replace({'float64': 0.0, 'object': 'None'})).drop_duplicates()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "88b9aa82",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table3 = final_phase_funding_table3.fillna(district_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dcc066c2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4 = awards4.fillna(awards4.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e9967490",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ushouse_df = ushouse_df.fillna(ushouse_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a846c82d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "senate_df = senate_df.fillna(senate_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6bd77a85",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assembly_df = assembly_df.fillna(assembly_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb81f83e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assembly_df.ctips_id.nunique(), awards4.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e26e7701",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table3.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "bb1a3085",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6ed4aa81",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "district_df.ctips_id.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f8d433cc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# drop_cols = ['chg_offcl', 'chg_qual1', 'chg_qual2','districtid', 'appdate', 'version','projcomp_date', 'agencyid', 'projectid', 'archive', 'agency_name']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f3552a50",
+   "metadata": {},
+   "source": [
+    "#### Ask: why do some projects have a code of 15 (which means the project is done) when `archive` is 0?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dde3c7b4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.archive.unique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a804c48f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table.chg_qual1.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5b8cbb62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "proj_cols_drop = ['appdate', 'archive',\n",
+    "       'high_ver', 'high_offcl', 'progcode1',\n",
+    "       'agencyid', 'projectid',\n",
+    "       'version', 'countyid']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cacb064a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agency_cols = ['agency_name', 'implpaed_agency',\n",
+    "       'implpse_agency', 'implrw_agency', 'implcon_agency']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b7fe0d71",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agency_df = projects_table[agency_cols + ['ctips_id']]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5ca7649f",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "agency_df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "34b5ca5c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_table = projects_table.drop(columns = proj_cols_drop + agency_cols)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bc2a413",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "district_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "18d7f4d8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "county_projects_df2.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b717d091",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "final_phase_funding_table3.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "76719372",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "awards4.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2b8f5328",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "ushouse_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "c68207d5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "senate_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "fb0745dd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assembly_df.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc7fed4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "with pd.ExcelWriter(\"CTIPS_data.xlsx\") as writer:\n",
+    "    projects_table.to_excel(writer, sheet_name=\"project\", index=False)\n",
+    "    agency_df.to_excel(writer, sheet_name=\"agencies\", index=False)\n",
+    "    district_df.to_excel(writer, sheet_name=\"district\", index=False)\n",
+    "    county_projects_df2.to_excel(writer, sheet_name=\"county\", index=False)\n",
+    "    final_phase_funding_table3.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n",
+    "    awards4.to_excel(writer, sheet_name=\"awards\", index=False)\n",
+    "    ushouse_df.to_excel(writer, sheet_name=\"us_house\", index=False)\n",
+    "    senate_df.to_excel(writer, sheet_name=\"senate\", index=False)\n",
+    "    assembly_df.to_excel(writer, sheet_name=\"assembly\", index=False)\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/project_list/add_lp2000_10_27_2023.ipynb b/project_list/add_lp2000_01_24_2024.ipynb
similarity index 51%
rename from project_list/add_lp2000_10_27_2023.ipynb
rename to project_list/add_lp2000_01_24_2024.ipynb
index d957c0aa6..b2a8059cc 100644
--- a/project_list/add_lp2000_10_27_2023.ipynb
+++ b/project_list/add_lp2000_01_24_2024.ipynb
@@ -2,8 +2,8 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 143,
-   "id": "690f3834",
+   "execution_count": 1,
+   "id": "b7e8790f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -11,13 +11,14 @@
     "import sqlalchemy \n",
     "import sys \n",
     "import re\n",
-    "import oracledb "
+    "import oracledb \n",
+    "import _database_utils as _utils "
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
-   "id": "bc9c8556",
+   "execution_count": 2,
+   "id": "b9f4a73c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -27,8 +28,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
-   "id": "0d7cc5e7",
+   "execution_count": 3,
+   "id": "e06681e8",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -40,8 +41,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
-   "id": "150ac181",
+   "execution_count": 5,
+   "id": "a1bf2f34",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -50,8 +51,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
-   "id": "bdf25f1a",
+   "execution_count": 6,
+   "id": "053f212c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -61,29 +62,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 149,
-   "id": "5595350c",
+   "execution_count": 7,
+   "id": "f446f7a3",
    "metadata": {},
    "outputs": [],
    "source": [
     "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   "
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 150,
-   "id": "14389690",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def to_snakecase(df):\n",
-    "    df.columns = df.columns.str.lower().str.replace(' ','_')\n",
-    "    return df"
-   ]
-  },
   {
    "cell_type": "markdown",
-   "id": "44b09522",
+   "id": "5e5e7b24",
    "metadata": {},
    "source": [
     "## Projects\n",
@@ -92,8 +81,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
-   "id": "c9bb15d4",
+   "execution_count": 8,
+   "id": "254d963f",
    "metadata": {
     "scrolled": true
    },
@@ -134,8 +123,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
-   "id": "9c6a6136",
+   "execution_count": 9,
+   "id": "7138a0f2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -144,8 +133,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
-   "id": "ca5a8ca7",
+   "execution_count": 10,
+   "id": "bf77a59d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -154,8 +143,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 154,
-   "id": "b88da8ed",
+   "execution_count": 11,
+   "id": "6ef5f157",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -164,8 +153,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 155,
-   "id": "3f0361ad",
+   "execution_count": 12,
+   "id": "6dd37cbb",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -174,8 +163,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 285,
-   "id": "3a8ca472",
+   "execution_count": 13,
+   "id": "a91e2bf9",
    "metadata": {},
    "outputs": [
     {
@@ -183,32 +172,32 @@
      "output_type": "stream",
      "text": [
       "<class 'pandas.core.frame.DataFrame'>\n",
-      "RangeIndex: 11768 entries, 0 to 11767\n",
+      "RangeIndex: 11272 entries, 0 to 11271\n",
       "Data columns (total 20 columns):\n",
       " #   Column                       Non-Null Count  Dtype         \n",
       "---  ------                       --------------  -----         \n",
-      " 0   project_id                   11768 non-null  object        \n",
-      " 1   comment_desc                 10892 non-null  object        \n",
-      " 2   district_code                11767 non-null  object        \n",
-      " 3   est_total_prj_costs          1083 non-null   float64       \n",
-      " 4   location_name                11401 non-null  object        \n",
-      " 5   project_label_name           10906 non-null  object        \n",
-      " 6   original_post_mile_begin_id  750 non-null    float64       \n",
-      " 7   original_post_mile_end_id    576 non-null    float64       \n",
+      " 0   project_id                   11272 non-null  object        \n",
+      " 1   comment_desc                 10399 non-null  object        \n",
+      " 2   district_code                11271 non-null  object        \n",
+      " 3   est_total_prj_costs          1329 non-null   float64       \n",
+      " 4   location_name                10906 non-null  object        \n",
+      " 5   project_label_name           10414 non-null  object        \n",
+      " 6   original_post_mile_begin_id  734 non-null    float64       \n",
+      " 7   original_post_mile_end_id    570 non-null    float64       \n",
       " 8   revised_post_mile_begin_ind  20 non-null     object        \n",
       " 9   revised_post_mile_end_ind    15 non-null     object        \n",
-      " 10  route_name                   11447 non-null  object        \n",
-      " 11  state_hwy_ind                11405 non-null  object        \n",
+      " 10  route_name                   10950 non-null  object        \n",
+      " 11  state_hwy_ind                10909 non-null  object        \n",
       " 12  senate_district_code         0 non-null      object        \n",
-      " 13  update_date_time             11699 non-null  datetime64[ns]\n",
-      " 14  agency_name                  11767 non-null  object        \n",
-      " 15  urban_area_code              5826 non-null   object        \n",
-      " 16  county_name                  11763 non-null  object        \n",
-      " 17  work_type_desc               2887 non-null   object        \n",
-      " 18  category_desc                10055 non-null  object        \n",
-      " 19  current_phase                11768 non-null  object        \n",
+      " 13  update_date_time             11203 non-null  datetime64[ns]\n",
+      " 14  agency_name                  11271 non-null  object        \n",
+      " 15  urban_area_code              5605 non-null   object        \n",
+      " 16  county_name                  11267 non-null  object        \n",
+      " 17  work_type_desc               2711 non-null   object        \n",
+      " 18  category_desc                9581 non-null   object        \n",
+      " 19  current_phase                11272 non-null  object        \n",
       "dtypes: datetime64[ns](1), float64(3), object(16)\n",
-      "memory usage: 1.8+ MB\n"
+      "memory usage: 1.7+ MB\n"
      ]
     }
    ],
@@ -218,7 +207,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e0fe254b",
+   "id": "948e473b",
    "metadata": {},
    "source": [
     "## EA Number\n",
@@ -228,8 +217,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 156,
-   "id": "fb96a824",
+   "execution_count": 14,
+   "id": "c78a9816",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -245,8 +234,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 157,
-   "id": "44018131",
+   "execution_count": 15,
+   "id": "3f67c483",
    "metadata": {},
    "outputs": [
     {
@@ -255,7 +244,7 @@
        "((49431, 4), 24130)"
       ]
      },
-     "execution_count": 157,
+     "execution_count": 15,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -266,8 +255,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 158,
-   "id": "eee7672c",
+   "execution_count": 16,
+   "id": "07a9f913",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -277,8 +266,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 159,
-   "id": "8b7a6f5b",
+   "execution_count": 17,
+   "id": "342ee6c4",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -287,8 +276,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 160,
-   "id": "8ca731d9",
+   "execution_count": 18,
+   "id": "b50c43f3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -298,8 +287,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 161,
-   "id": "b2cb6d0c",
+   "execution_count": 19,
+   "id": "8f43633c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -309,8 +298,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 162,
-   "id": "8f6816c6",
+   "execution_count": 20,
+   "id": "6c6b17aa",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -320,8 +309,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 163,
-   "id": "638c4350",
+   "execution_count": 21,
+   "id": "6efbe740",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -331,17 +320,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 164,
-   "id": "0547982c",
+   "execution_count": 22,
+   "id": "10aaabe1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "3030"
+       "2961"
       ]
      },
-     "execution_count": 164,
+     "execution_count": 22,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -352,16 +341,16 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 165,
-   "id": "5c57055d",
+   "execution_count": 23,
+   "id": "98e5481f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "count   2944.00\n",
+       "count   2880.00\n",
        "mean       1.03\n",
-       "std        0.18\n",
+       "std        0.17\n",
        "min        1.00\n",
        "25%        1.00\n",
        "50%        1.00\n",
@@ -370,7 +359,7 @@
        "Name: project_id, dtype: float64"
       ]
      },
-     "execution_count": 165,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -381,17 +370,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 166,
-   "id": "f245bf36",
+   "execution_count": 24,
+   "id": "8ae90cf5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "2944"
+       "2880"
       ]
      },
-     "execution_count": 166,
+     "execution_count": 24,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -402,22 +391,22 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 167,
-   "id": "a03488a6",
+   "execution_count": 25,
+   "id": "ae8a5b68",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "5008(072)    3\n",
+       "5953(536)    3\n",
        "5932(042)    3\n",
        "5006(504)    3\n",
-       "5006(635)    3\n",
-       "5953(536)    3\n",
+       "5435(010)    2\n",
        "Name: project_id, dtype: int64"
       ]
      },
-     "execution_count": 167,
+     "execution_count": 25,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -428,8 +417,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 168,
-   "id": "2a24acdd",
+   "execution_count": 26,
+   "id": "d8e0c58a",
    "metadata": {},
    "outputs": [
     {
@@ -460,39 +449,17 @@
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2381</th>\n",
-       "      <td>07</td>\n",
-       "      <td>4S6608</td>\n",
-       "      <td>5006(635)</td>\n",
-       "      <td>2009-09-10 13:58:44</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2382</th>\n",
-       "      <td>07</td>\n",
-       "      <td>933575</td>\n",
-       "      <td>5006(635)</td>\n",
-       "      <td>2009-07-02 14:46:18</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2383</th>\n",
-       "      <td>07</td>\n",
-       "      <td>4U4414</td>\n",
-       "      <td>5006(635)</td>\n",
-       "      <td>2009-09-10 13:56:35</td>\n",
-       "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     district_code expense_authorization_id project_id      ea_assign_date\n",
-       "2381            07                   4S6608  5006(635) 2009-09-10 13:58:44\n",
-       "2382            07                   933575  5006(635) 2009-07-02 14:46:18\n",
-       "2383            07                   4U4414  5006(635) 2009-09-10 13:56:35"
+       "Empty DataFrame\n",
+       "Columns: [district_code, expense_authorization_id, project_id, ea_assign_date]\n",
+       "Index: []"
       ]
      },
-     "execution_count": 168,
+     "execution_count": 26,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -503,8 +470,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 169,
-   "id": "682ff6e0",
+   "execution_count": 27,
+   "id": "6d002cda",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -520,17 +487,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 170,
-   "id": "c9bfb0a4",
+   "execution_count": 28,
+   "id": "df8f4a43",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "2944"
+       "2880"
       ]
      },
-     "execution_count": 170,
+     "execution_count": 28,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -541,8 +508,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 171,
-   "id": "c6993849",
+   "execution_count": 29,
+   "id": "4a81759a",
    "metadata": {},
    "outputs": [
     {
@@ -615,7 +582,7 @@
        "4            04                   985979  5178(016)"
       ]
      },
-     "execution_count": 171,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -626,17 +593,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 172,
-   "id": "f9eff2f3",
+   "execution_count": 30,
+   "id": "28a7a7e1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "2873"
+       "2814"
       ]
      },
-     "execution_count": 172,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -647,7 +614,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "19049968",
+   "id": "6f73b4a0",
    "metadata": {},
    "source": [
     "#### The same EA number matches to multiple projects\n",
@@ -657,8 +624,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 286,
-   "id": "503db9e1",
+   "execution_count": 31,
+   "id": "6575e623",
    "metadata": {},
    "outputs": [
     {
@@ -690,21 +657,21 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>127</th>\n",
+       "      <th>120</th>\n",
        "      <td>01</td>\n",
        "      <td>924969</td>\n",
        "      <td>5904(114)</td>\n",
        "      <td>2011-02-28 10:37:39</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>404</th>\n",
+       "      <th>390</th>\n",
        "      <td>03</td>\n",
        "      <td>924969</td>\n",
        "      <td>5238(018)</td>\n",
        "      <td>1998-06-04 00:00:00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1428</th>\n",
+       "      <th>1396</th>\n",
        "      <td>08</td>\n",
        "      <td>924969</td>\n",
        "      <td>NBIL(502)</td>\n",
@@ -716,12 +683,12 @@
       ],
       "text/plain": [
        "     district_code expense_authorization_id project_id      ea_assign_date\n",
-       "127             01                   924969  5904(114) 2011-02-28 10:37:39\n",
-       "404             03                   924969  5238(018) 1998-06-04 00:00:00\n",
-       "1428            08                   924969  NBIL(502) 2006-06-23 16:18:52"
+       "120             01                   924969  5904(114) 2011-02-28 10:37:39\n",
+       "390             03                   924969  5238(018) 1998-06-04 00:00:00\n",
+       "1396            08                   924969  NBIL(502) 2006-06-23 16:18:52"
       ]
      },
-     "execution_count": 286,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -732,8 +699,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 287,
-   "id": "0582d2a5",
+   "execution_count": 32,
+   "id": "3ebc8d00",
    "metadata": {},
    "outputs": [
     {
@@ -765,14 +732,14 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>424</th>\n",
+       "      <th>409</th>\n",
        "      <td>08</td>\n",
        "      <td>924360</td>\n",
        "      <td>0061(025)</td>\n",
        "      <td>1998-12-23 00:00:00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1248</th>\n",
+       "      <th>1218</th>\n",
        "      <td>04</td>\n",
        "      <td>924360</td>\n",
        "      <td>6003(030)</td>\n",
@@ -784,11 +751,11 @@
       ],
       "text/plain": [
        "     district_code expense_authorization_id project_id      ea_assign_date\n",
-       "424             08                   924360  0061(025) 1998-12-23 00:00:00\n",
-       "1248            04                   924360  6003(030) 2005-05-17 15:25:28"
+       "409             08                   924360  0061(025) 1998-12-23 00:00:00\n",
+       "1218            04                   924360  6003(030) 2005-05-17 15:25:28"
       ]
      },
-     "execution_count": 287,
+     "execution_count": 32,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -799,8 +766,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 175,
-   "id": "e58e2bc7",
+   "execution_count": 33,
+   "id": "8c407170",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -809,8 +776,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 176,
-   "id": "36be128e",
+   "execution_count": 34,
+   "id": "d5982845",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -819,7 +786,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "f22f3bd9",
+   "id": "b41a3e92",
    "metadata": {},
    "source": [
     "* Shares the EA of 924360"
@@ -827,8 +794,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 177,
-   "id": "c4327e30",
+   "execution_count": 35,
+   "id": "00e0889b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -837,7 +804,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "7e25fe9c",
+   "id": "cbd3c992",
    "metadata": {},
    "source": [
     "## EFIS_MV_BUD_STRU_94_LVL_3_VW\n",
@@ -848,8 +815,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 178,
-   "id": "f9f06cfa",
+   "execution_count": 36,
+   "id": "adeb37a5",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -869,17 +836,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 179,
-   "id": "dfb521d0",
+   "execution_count": 37,
+   "id": "664c144a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(45227, 7)"
+       "(45666, 7)"
       ]
      },
-     "execution_count": 179,
+     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -890,17 +857,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 180,
-   "id": "52422939",
+   "execution_count": 38,
+   "id": "1cc8c5d9",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "19552"
+       "19821"
       ]
      },
-     "execution_count": 180,
+     "execution_count": 38,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -911,7 +878,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e8befd89",
+   "id": "2a74da37",
    "metadata": {},
    "source": [
     "### Efis Join\n",
@@ -920,8 +887,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 181,
-   "id": "e046b88c",
+   "execution_count": 39,
+   "id": "972329d2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -936,7 +903,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "d1c4df99",
+   "id": "3c566ba3",
    "metadata": {},
    "source": [
     "* Exclude project status because it's just the financial status of the project, not construction or whatever."
@@ -944,8 +911,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 182,
-   "id": "a7232b09",
+   "execution_count": 40,
+   "id": "c936398b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -958,8 +925,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 183,
-   "id": "c1a9bf0e",
+   "execution_count": 41,
+   "id": "a39f1bad",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -968,8 +935,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 184,
-   "id": "e811b5e2",
+   "execution_count": 42,
+   "id": "e3c9315e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -979,8 +946,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 185,
-   "id": "9c7336f8",
+   "execution_count": 43,
+   "id": "c818486a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -990,8 +957,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 186,
-   "id": "4d42c325",
+   "execution_count": 44,
+   "id": "918cc1c9",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1002,21 +969,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 187,
-   "id": "0f8623fb",
+   "execution_count": 45,
+   "id": "13a41288",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "_merge    \n",
-       "left_only     34284\n",
-       "both          10958\n",
-       "right_only       85\n",
+       "left_only     34832\n",
+       "both          10849\n",
+       "right_only       83\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 187,
+     "execution_count": 45,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1027,8 +994,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 188,
-   "id": "8a6cde72",
+   "execution_count": 46,
+   "id": "bd3a6659",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1037,7 +1004,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "35caae64",
+   "id": "1c85c416",
    "metadata": {},
    "source": [
     "* 77 project ids missing after inner join."
@@ -1045,17 +1012,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 189,
-   "id": "afceb420",
+   "execution_count": 47,
+   "id": "7b2672b6",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4788"
+       "4810"
       ]
      },
-     "execution_count": 189,
+     "execution_count": 47,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1066,17 +1033,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 190,
-   "id": "f89ecbbd",
+   "execution_count": 48,
+   "id": "a5b6529d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4718"
+       "4742"
       ]
      },
-     "execution_count": 190,
+     "execution_count": 48,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1087,7 +1054,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "6f2a4ece",
+   "id": "bfb5e764",
    "metadata": {},
    "source": [
     "## Subset only  for the relevant project_ids from `Projects`\n",
@@ -1096,8 +1063,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 191,
-   "id": "f11e6fc4",
+   "execution_count": 49,
+   "id": "0c99473a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1106,17 +1073,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 192,
-   "id": "83b1c540",
+   "execution_count": 50,
+   "id": "0be547fb",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(11768, (11768, 1))"
+       "(11272, (11272, 1))"
       ]
      },
-     "execution_count": 192,
+     "execution_count": 50,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1128,17 +1095,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 193,
-   "id": "983ce05c",
+   "execution_count": 51,
+   "id": "98a8470f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(11768, 1)"
+       "(11272, 1)"
       ]
      },
-     "execution_count": 193,
+     "execution_count": 51,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1149,21 +1116,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 194,
-   "id": "a2cf22ad",
+   "execution_count": 52,
+   "id": "abf5aa6c",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "_merge    \n",
-       "both          10289\n",
-       "right_only     7463\n",
-       "left_only       669\n",
+       "both          10186\n",
+       "right_only     6942\n",
+       "left_only       663\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 194,
+     "execution_count": 52,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1174,8 +1141,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 195,
-   "id": "cb24c2e2",
+   "execution_count": 53,
+   "id": "715ef53f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1184,17 +1151,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 196,
-   "id": "9b238f72",
+   "execution_count": 54,
+   "id": "a0b7ce55",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4305"
+       "4330"
       ]
      },
-     "execution_count": 196,
+     "execution_count": 54,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1206,8 +1173,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 197,
-   "id": "f83ce15d",
+   "execution_count": 55,
+   "id": "a276ed04",
    "metadata": {},
    "outputs": [
     {
@@ -1221,7 +1188,7 @@
        "Name: project_id, dtype: int64"
       ]
      },
-     "execution_count": 197,
+     "execution_count": 55,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1232,7 +1199,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "3e7dd4af",
+   "id": "e9341b04",
    "metadata": {},
    "source": [
     "## Bring in pect_description for `Projects` -> Double Check\n",
@@ -1244,13 +1211,13 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 198,
-   "id": "33194243",
+   "execution_count": 56,
+   "id": "67c1fc59",
    "metadata": {},
    "outputs": [],
    "source": [
     "def load_pec(excel_file:str)-> pd.DataFrame:\n",
-    "    df = to_snakecase(pd.read_excel(excel_file))\n",
+    "    df = _utils.to_snakecase(pd.read_excel(excel_file))\n",
     "    \n",
     "    # Drop rows that are all nan\n",
     "    df = df.dropna(how='all').reset_index(drop =  True)\n",
@@ -1286,8 +1253,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 199,
-   "id": "96b9ab27",
+   "execution_count": 57,
+   "id": "a5ddf1bf",
    "metadata": {},
    "outputs": [
     {
@@ -1304,8 +1271,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 200,
-   "id": "da554116",
+   "execution_count": 58,
+   "id": "72100e15",
    "metadata": {},
    "outputs": [
     {
@@ -1314,7 +1281,7 @@
        "(799, 4)"
       ]
      },
-     "execution_count": 200,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1325,8 +1292,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 201,
-   "id": "9abb7310",
+   "execution_count": 59,
+   "id": "95554c2e",
    "metadata": {},
    "outputs": [
     {
@@ -1358,43 +1325,48 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>720</th>\n",
-       "      <td>4050203</td>\n",
-       "      <td>845</td>\n",
-       "      <td>Dumbarton Bridge RM1</td>\n",
-       "      <td>Toll Bridge Program</td>\n",
+       "      <th>682</th>\n",
+       "      <td>4050201</td>\n",
+       "      <td>151</td>\n",
+       "      <td>Drainage System Restoration</td>\n",
+       "      <td>State Hwy Operation &amp; Protection Program (SHOPP)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>476</th>\n",
-       "      <td>2080385</td>\n",
-       "      <td>851</td>\n",
-       "      <td>Reimbursement from BATA - Antioch</td>\n",
-       "      <td>Reimbursement of Toll Bridge Maintenance and Toll Collection Costs from the Bay Area Toll Authority ( BATA)</td>\n",
+       "      <th>494</th>\n",
+       "      <td>2080437</td>\n",
+       "      <td>0</td>\n",
+       "      <td>TMS Electrical Material Procurement</td>\n",
+       "      <td>Transportation Management System (TMS) Electrical Material Procurement</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>486</th>\n",
-       "      <td>2080410</td>\n",
-       "      <td>0</td>\n",
-       "      <td>Lighting</td>\n",
-       "      <td>Lighting</td>\n",
+       "      <th>331</th>\n",
+       "      <td>2030010</td>\n",
+       "      <td>630</td>\n",
+       "      <td>Rebuilding American Infrastructure with Sustainability and Equity (RAISE) and Multimodal Project Discretionary Grant Programs (e.g., INFRA, MEGA, RSTG or RURAL)</td>\n",
+       "      <td>Local Assistance</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "         pec  pect                   pect_description  \\\n",
-       "720  4050203   845               Dumbarton Bridge RM1   \n",
-       "476  2080385   851  Reimbursement from BATA - Antioch   \n",
-       "486  2080410     0                           Lighting   \n",
-       "\n",
-       "                                                                                                         program  \n",
-       "720                                                                                          Toll Bridge Program  \n",
-       "476  Reimbursement of Toll Bridge Maintenance and Toll Collection Costs from the Bay Area Toll Authority ( BATA)  \n",
-       "486                                                                                                     Lighting  "
+       "         pec  pect  \\\n",
+       "682  4050201   151   \n",
+       "494  2080437     0   \n",
+       "331  2030010   630   \n",
+       "\n",
+       "                                                                                                                                                      pect_description  \\\n",
+       "682                                                                                                                                        Drainage System Restoration   \n",
+       "494                                                                                                                                TMS Electrical Material Procurement   \n",
+       "331  Rebuilding American Infrastructure with Sustainability and Equity (RAISE) and Multimodal Project Discretionary Grant Programs (e.g., INFRA, MEGA, RSTG or RURAL)    \n",
+       "\n",
+       "                                                                    program  \n",
+       "682                        State Hwy Operation & Protection Program (SHOPP)  \n",
+       "494  Transportation Management System (TMS) Electrical Material Procurement  \n",
+       "331                                                        Local Assistance  "
       ]
      },
-     "execution_count": 201,
+     "execution_count": 59,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1405,7 +1377,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "4e3e39a6",
+   "id": "7ae7dade",
    "metadata": {},
    "source": [
     "### Turn this part to script once finalized"
@@ -1413,8 +1385,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 202,
-   "id": "9d07dc9c",
+   "execution_count": 60,
+   "id": "d16d49fe",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1423,8 +1395,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 203,
-   "id": "50498b4b",
+   "execution_count": 61,
+   "id": "1915efd6",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1433,8 +1405,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 204,
-   "id": "de116827",
+   "execution_count": 62,
+   "id": "5b161320",
    "metadata": {},
    "outputs": [
     {
@@ -1492,7 +1464,7 @@
        "0       -38.99             535  6200(024)  "
       ]
      },
-     "execution_count": 204,
+     "execution_count": 62,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1503,8 +1475,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 205,
-   "id": "f6915f79",
+   "execution_count": 63,
+   "id": "2b9bc1bf",
    "metadata": {},
    "outputs": [
     {
@@ -1544,45 +1516,45 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>7498</th>\n",
-       "      <td>0813000007</td>\n",
+       "      <th>1678</th>\n",
+       "      <td>0316000060</td>\n",
        "      <td>0890</td>\n",
        "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>1645967.06</td>\n",
-       "      <td>1645967.06</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5954(108)</td>\n",
+       "      <td>2122</td>\n",
+       "      <td>200000.00</td>\n",
+       "      <td>62549.18</td>\n",
+       "      <td>820</td>\n",
+       "      <td>6203(069)</td>\n",
        "      <td>2030010</td>\n",
-       "      <td>300.00</td>\n",
-       "      <td>Highway Bridge</td>\n",
+       "      <td>820.00</td>\n",
+       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
        "      <td>Local Assistance</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>4227</th>\n",
-       "      <td>0517000187</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>2122</td>\n",
-       "      <td>82876.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>650</td>\n",
-       "      <td>32L0(084)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>650.00</td>\n",
-       "      <td>Emergency Relief (ER)</td>\n",
-       "      <td>Local Assistance</td>\n",
+       "      <th>6994</th>\n",
+       "      <td>0722000309</td>\n",
+       "      <td>3290</td>\n",
+       "      <td>2030720</td>\n",
+       "      <td>2021</td>\n",
+       "      <td>10000.00</td>\n",
+       "      <td>3520.00</td>\n",
+       "      <td>100</td>\n",
+       "      <td>5352(023)</td>\n",
+       "      <td>2030720</td>\n",
+       "      <td>100.00</td>\n",
+       "      <td>Active Transportation Program (ATP)</td>\n",
+       "      <td>Active Transportation Program (ATP)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2899</th>\n",
-       "      <td>0416000110</td>\n",
+       "      <th>2823</th>\n",
+       "      <td>0415000111</td>\n",
        "      <td>0890</td>\n",
        "      <td>2030010</td>\n",
-       "      <td>1516</td>\n",
-       "      <td>126000.00</td>\n",
-       "      <td>126000.00</td>\n",
+       "      <td>1617</td>\n",
+       "      <td>350000.00</td>\n",
+       "      <td>350000.00</td>\n",
        "      <td>300</td>\n",
-       "      <td>5094(065)</td>\n",
+       "      <td>6003(052)</td>\n",
        "      <td>2030010</td>\n",
        "      <td>300.00</td>\n",
        "      <td>Highway Bridge</td>\n",
@@ -1594,22 +1566,27 @@
       ],
       "text/plain": [
        "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "7498     0813000007      0890  2030010                        1112   \n",
-       "4227     0517000187      0890  2030010                        2122   \n",
-       "2899     0416000110      0890  2030010                        1516   \n",
+       "1678     0316000060      0890  2030010                        2122   \n",
+       "6994     0722000309      3290  2030720                        2021   \n",
+       "2823     0415000111      0890  2030010                        1617   \n",
        "\n",
        "      curr_bud_am  cash_exp_am  pect_task_code project_id      pec   pect  \\\n",
-       "7498   1645967.06   1645967.06             300  5954(108)  2030010 300.00   \n",
-       "4227     82876.00         0.00             650  32L0(084)  2030010 650.00   \n",
-       "2899    126000.00    126000.00             300  5094(065)  2030010 300.00   \n",
+       "1678    200000.00     62549.18             820  6203(069)  2030010 820.00   \n",
+       "6994     10000.00      3520.00             100  5352(023)  2030720 100.00   \n",
+       "2823    350000.00    350000.00             300  6003(052)  2030010 300.00   \n",
        "\n",
-       "           pect_description           program  \n",
-       "7498        Highway Bridge   Local Assistance  \n",
-       "4227  Emergency Relief (ER)  Local Assistance  \n",
-       "2899        Highway Bridge   Local Assistance  "
+       "                                                    pect_description  \\\n",
+       "1678  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
+       "6994                             Active Transportation Program (ATP)   \n",
+       "2823                                                 Highway Bridge    \n",
+       "\n",
+       "                                  program  \n",
+       "1678                     Local Assistance  \n",
+       "6994  Active Transportation Program (ATP)  \n",
+       "2823                     Local Assistance  "
       ]
      },
-     "execution_count": 205,
+     "execution_count": 63,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1620,17 +1597,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 206,
-   "id": "d17b923a",
+   "execution_count": 64,
+   "id": "f0749fec",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(10289, 4305)"
+       "(10186, 4330)"
       ]
      },
-     "execution_count": 206,
+     "execution_count": 64,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1641,8 +1618,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 207,
-   "id": "84a31505",
+   "execution_count": 65,
+   "id": "23164a0d",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1652,8 +1629,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 208,
-   "id": "863b113e",
+   "execution_count": 66,
+   "id": "af61c211",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1663,17 +1640,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 209,
-   "id": "8d33c12c",
+   "execution_count": 67,
+   "id": "26e84b16",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "10096"
+       "9999"
       ]
      },
-     "execution_count": 209,
+     "execution_count": 67,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1685,8 +1662,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 210,
-   "id": "27c3a8a5",
+   "execution_count": 68,
+   "id": "25132e11",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1695,8 +1672,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 211,
-   "id": "56fbc89c",
+   "execution_count": 69,
+   "id": "6b9911a3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1708,8 +1685,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 212,
-   "id": "f53db68f",
+   "execution_count": 70,
+   "id": "f4502e96",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1718,8 +1695,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 213,
-   "id": "62d990e9",
+   "execution_count": 71,
+   "id": "13866e8a",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1728,8 +1705,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 214,
-   "id": "d065468d",
+   "execution_count": 72,
+   "id": "e428d60c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1738,17 +1715,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 215,
-   "id": "182af512",
+   "execution_count": 73,
+   "id": "c23e02b1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(4305, 4305, 4305)"
+       "(4330, 4330, 4330)"
       ]
      },
-     "execution_count": 215,
+     "execution_count": 73,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1759,18 +1736,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 216,
-   "id": "391cfc99",
+   "execution_count": 74,
+   "id": "9106c0ef",
    "metadata": {},
    "outputs": [],
    "source": [
-    "pect_df3 = to_snakecase(pect_df3)"
+    "pect_df3 = _utils.to_snakecase(pect_df3)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 217,
-   "id": "98a7e1a6",
+   "execution_count": 75,
+   "id": "653ba5a5",
    "metadata": {
     "scrolled": true
    },
@@ -1783,8 +1760,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 218,
-   "id": "6a5e322b",
+   "execution_count": 76,
+   "id": "eed16ac2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1793,8 +1770,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 219,
-   "id": "1e767e54",
+   "execution_count": 77,
+   "id": "be991521",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1804,21 +1781,21 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 220,
-   "id": "864532fc",
+   "execution_count": 78,
+   "id": "6c7b81c5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
        "_merge    \n",
-       "right_only    7463\n",
-       "both          4305\n",
+       "right_only    6942\n",
+       "both          4330\n",
        "left_only        0\n",
        "dtype: int64"
       ]
      },
-     "execution_count": 220,
+     "execution_count": 78,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1829,8 +1806,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 221,
-   "id": "499e4dcc",
+   "execution_count": 79,
+   "id": "f7e7d7f3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1841,8 +1818,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 222,
-   "id": "4c59d9e9",
+   "execution_count": 80,
+   "id": "034acf09",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1852,17 +1829,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 223,
-   "id": "83765a97",
+   "execution_count": 81,
+   "id": "ad74eafc",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "11768"
+       "11272"
       ]
      },
-     "execution_count": 223,
+     "execution_count": 81,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1873,8 +1850,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 224,
-   "id": "a02d34e9",
+   "execution_count": 82,
+   "id": "6e2873ec",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -1884,7 +1861,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "b08e0591",
+   "id": "d414f29a",
    "metadata": {},
    "source": [
     "### Double check"
@@ -1892,8 +1869,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 225,
-   "id": "d7bb56fa",
+   "execution_count": 83,
+   "id": "d2561ecb",
    "metadata": {},
    "outputs": [
     {
@@ -1924,25 +1901,25 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1610</th>\n",
+       "      <th>1615</th>\n",
        "      <td>Highway Bridge</td>\n",
        "      <td>690839.49</td>\n",
        "      <td>5918(101)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1611</th>\n",
+       "      <th>1616</th>\n",
        "      <td>Earmarks Projects (HPP, DEMO CPFCDS, etc.)</td>\n",
        "      <td>238679.79</td>\n",
        "      <td>5918(101)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1612</th>\n",
+       "      <th>1617</th>\n",
        "      <td>Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)</td>\n",
        "      <td>0.00</td>\n",
        "      <td>5918(101)</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1613</th>\n",
+       "      <th>1618</th>\n",
        "      <td>Highway Bridge</td>\n",
        "      <td>472887.51</td>\n",
        "      <td>5918(101)</td>\n",
@@ -1953,19 +1930,19 @@
       ],
       "text/plain": [
        "                                                                                           pect_description  \\\n",
-       "1610                                                                                        Highway Bridge    \n",
-       "1611                                                             Earmarks Projects (HPP, DEMO CPFCDS, etc.)   \n",
-       "1612  Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)   \n",
-       "1613                                                                                        Highway Bridge    \n",
+       "1615                                                                                        Highway Bridge    \n",
+       "1616                                                             Earmarks Projects (HPP, DEMO CPFCDS, etc.)   \n",
+       "1617  Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)   \n",
+       "1618                                                                                        Highway Bridge    \n",
        "\n",
        "      curr_bud_am project_id  \n",
-       "1610    690839.49  5918(101)  \n",
-       "1611    238679.79  5918(101)  \n",
-       "1612         0.00  5918(101)  \n",
-       "1613    472887.51  5918(101)  "
+       "1615    690839.49  5918(101)  \n",
+       "1616    238679.79  5918(101)  \n",
+       "1617         0.00  5918(101)  \n",
+       "1618    472887.51  5918(101)  "
       ]
      },
-     "execution_count": 225,
+     "execution_count": 83,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1976,93 +1953,93 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 226,
-   "id": "8c1f5e90",
+   "execution_count": 84,
+   "id": "50113d38",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "<style  type=\"text/css\" >\n",
-       "#T_f19e5_row0_col29,#T_f19e5_row0_col36,#T_f19e5_row0_col47{\n",
+       "#T_3da78_row0_col29,#T_3da78_row0_col36,#T_3da78_row0_col47{\n",
        "            color:  red;\n",
-       "        }</style><table id=\"T_f19e5_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >comment_desc</th>        <th class=\"col_heading level0 col2\" >district_code</th>        <th class=\"col_heading level0 col3\" >est_total_prj_costs</th>        <th class=\"col_heading level0 col4\" >location_name</th>        <th class=\"col_heading level0 col5\" >project_label_name</th>        <th class=\"col_heading level0 col6\" >original_post_mile_begin_id</th>        <th class=\"col_heading level0 col7\" >original_post_mile_end_id</th>        <th class=\"col_heading level0 col8\" >revised_post_mile_begin_ind</th>        <th class=\"col_heading level0 col9\" >revised_post_mile_end_ind</th>        <th class=\"col_heading level0 col10\" >route_name</th>        <th class=\"col_heading level0 col11\" >state_hwy_ind</th>        <th class=\"col_heading level0 col12\" >senate_district_code</th>        <th class=\"col_heading level0 col13\" >update_date_time</th>        <th class=\"col_heading level0 col14\" >agency_name</th>        <th class=\"col_heading level0 col15\" >urban_area_code</th>        <th class=\"col_heading level0 col16\" >county_name</th>        <th class=\"col_heading level0 col17\" >work_type_desc</th>        <th class=\"col_heading level0 col18\" >category_desc</th>        <th class=\"col_heading level0 col19\" >current_phase</th>        <th class=\"col_heading level0 col20\" >active_transportation_program_(atp)</th>        <th class=\"col_heading level0 col21\" >bridge_inspection_&_scour_evaluation</th>        <th class=\"col_heading level0 col22\" >covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation</th>        <th class=\"col_heading level0 col23\" >carbon_reduction_program_(crp)</th>        <th class=\"col_heading level0 col24\" >congestion_mitigation_&_air_quality_improvement_program_(cmaq)</th>        
<th class=\"col_heading level0 col25\" >coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds</th>        <th class=\"col_heading level0 col26\" >corridor_mobility_improvement_account_(cmia)_program</th>        <th class=\"col_heading level0 col27\" >county_exchange_funds</th>        <th class=\"col_heading level0 col28\" >county_state_match_program</th>        <th class=\"col_heading level0 col29\" >earmarks_projects_(hpp,_demo_cpfcds,_etc.)</th>        <th class=\"col_heading level0 col30\" >emergency_relief_(er)</th>        <th class=\"col_heading level0 col31\" >ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program</th>        <th class=\"col_heading level0 col32\" >funds_for_planning,_programming_and_monitoring_-_rip</th>        <th class=\"col_heading level0 col33\" >general_funded_designated_programs</th>        <th class=\"col_heading level0 col34\" >hazard_elimination_safety_(hes)</th>        <th class=\"col_heading level0 col35\" >high_risk_rural_roads_program_(hr3)</th>        <th class=\"col_heading level0 col36\" >highway_bridge_</th>        <th class=\"col_heading level0 col37\" >highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund</th>        <th class=\"col_heading level0 col38\" >highway_safety_improvement_program_(hsip)_(non-infrastructure)</th>        <th class=\"col_heading level0 col39\" >highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund</th>        <th class=\"col_heading level0 col40\" >local_partnership_program_(lpp_–_competitive)_</th>        <th class=\"col_heading level0 col41\" >local_roads</th>        <th class=\"col_heading level0 col42\" >local_roads_rehabilitation</th>        <th class=\"col_heading level0 col43\" >railroad_grade_crossing_protection</th>        <th class=\"col_heading level0 col44\" >railroad_grade_separations</th>        <th class=\"col_heading level0 col45\" 
>rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_</th>        <th class=\"col_heading level0 col46\" >regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system)</th>        <th class=\"col_heading level0 col47\" >regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip)</th>        <th class=\"col_heading level0 col48\" >regional_transportation_planning_agency_(rtpa)_stp_match_exchange</th>        <th class=\"col_heading level0 col49\" >sb1_funded_freeway_service_patrol</th>        <th class=\"col_heading level0 col50\" >shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds</th>        <th class=\"col_heading level0 col51\" >safe_routes_to_school_(sr2s_and_srts)</th>        <th class=\"col_heading level0 col52\" >set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act</th>        <th class=\"col_heading level0 col53\" >solutions_for_congested_corridors_program_(sccp)</th>        <th class=\"col_heading level0 col54\" >special_programs</th>        <th class=\"col_heading level0 col55\" >state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic)</th>        <th class=\"col_heading level0 col56\" >structures_seismic_retrofit_</th>        <th class=\"col_heading level0 col57\" >trade_corridor_enhancement_account_(tcea)_programs_–_local_share</th>        <th class=\"col_heading level0 col58\" >trade_corridor_enhancement_account_(tcea)_programs_–_state_share</th>        <th class=\"col_heading level0 col59\" >trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads</th>        <th class=\"col_heading level0 col60\" >traffic_congestion_relief_program_(_tcrp_)</th>        <th class=\"col_heading level0 col61\" >unknown</th>    </tr></thead><tbody>\n",
+       "        }</style><table id=\"T_3da78_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >comment_desc</th>        <th class=\"col_heading level0 col2\" >district_code</th>        <th class=\"col_heading level0 col3\" >est_total_prj_costs</th>        <th class=\"col_heading level0 col4\" >location_name</th>        <th class=\"col_heading level0 col5\" >project_label_name</th>        <th class=\"col_heading level0 col6\" >original_post_mile_begin_id</th>        <th class=\"col_heading level0 col7\" >original_post_mile_end_id</th>        <th class=\"col_heading level0 col8\" >revised_post_mile_begin_ind</th>        <th class=\"col_heading level0 col9\" >revised_post_mile_end_ind</th>        <th class=\"col_heading level0 col10\" >route_name</th>        <th class=\"col_heading level0 col11\" >state_hwy_ind</th>        <th class=\"col_heading level0 col12\" >senate_district_code</th>        <th class=\"col_heading level0 col13\" >update_date_time</th>        <th class=\"col_heading level0 col14\" >agency_name</th>        <th class=\"col_heading level0 col15\" >urban_area_code</th>        <th class=\"col_heading level0 col16\" >county_name</th>        <th class=\"col_heading level0 col17\" >work_type_desc</th>        <th class=\"col_heading level0 col18\" >category_desc</th>        <th class=\"col_heading level0 col19\" >current_phase</th>        <th class=\"col_heading level0 col20\" >active_transportation_program_(atp)</th>        <th class=\"col_heading level0 col21\" >bridge_inspection_&_scour_evaluation</th>        <th class=\"col_heading level0 col22\" >covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation</th>        <th class=\"col_heading level0 col23\" >carbon_reduction_program_(crp)</th>        <th class=\"col_heading level0 col24\" >congestion_mitigation_&_air_quality_improvement_program_(cmaq)</th>        
<th class=\"col_heading level0 col25\" >coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds</th>        <th class=\"col_heading level0 col26\" >corridor_mobility_improvement_account_(cmia)_program</th>        <th class=\"col_heading level0 col27\" >county_exchange_funds</th>        <th class=\"col_heading level0 col28\" >county_state_match_program</th>        <th class=\"col_heading level0 col29\" >earmarks_projects_(hpp,_demo_cpfcds,_etc.)</th>        <th class=\"col_heading level0 col30\" >emergency_relief_(er)</th>        <th class=\"col_heading level0 col31\" >ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program</th>        <th class=\"col_heading level0 col32\" >funds_for_planning,_programming_and_monitoring_-_rip</th>        <th class=\"col_heading level0 col33\" >general_funded_designated_programs</th>        <th class=\"col_heading level0 col34\" >hazard_elimination_safety_(hes)</th>        <th class=\"col_heading level0 col35\" >high_risk_rural_roads_program_(hr3)</th>        <th class=\"col_heading level0 col36\" >highway_bridge_</th>        <th class=\"col_heading level0 col37\" >highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund</th>        <th class=\"col_heading level0 col38\" >highway_safety_improvement_program_(hsip)_(non-infrastructure)</th>        <th class=\"col_heading level0 col39\" >highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund</th>        <th class=\"col_heading level0 col40\" >local_partnership_program_(lpp_–_competitive)_</th>        <th class=\"col_heading level0 col41\" >local_roads</th>        <th class=\"col_heading level0 col42\" >local_roads_rehabilitation</th>        <th class=\"col_heading level0 col43\" >railroad_grade_crossing_protection</th>        <th class=\"col_heading level0 col44\" >railroad_grade_separations</th>        <th class=\"col_heading level0 col45\" 
>rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_</th>        <th class=\"col_heading level0 col46\" >regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system)</th>        <th class=\"col_heading level0 col47\" >regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip)</th>        <th class=\"col_heading level0 col48\" >regional_transportation_planning_agency_(rtpa)_stp_match_exchange</th>        <th class=\"col_heading level0 col49\" >sb1_funded_freeway_service_patrol</th>        <th class=\"col_heading level0 col50\" >shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds</th>        <th class=\"col_heading level0 col51\" >safe_routes_to_school_(sr2s_and_srts)</th>        <th class=\"col_heading level0 col52\" >set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act</th>        <th class=\"col_heading level0 col53\" >solutions_for_congested_corridors_program_(sccp)</th>        <th class=\"col_heading level0 col54\" >special_programs</th>        <th class=\"col_heading level0 col55\" >state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic)</th>        <th class=\"col_heading level0 col56\" >structures_seismic_retrofit_</th>        <th class=\"col_heading level0 col57\" >trade_corridor_enhancement_account_(tcea)_programs_–_local_share</th>        <th class=\"col_heading level0 col58\" >trade_corridor_enhancement_account_(tcea)_programs_–_state_share</th>        <th class=\"col_heading level0 col59\" >trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads</th>        <th class=\"col_heading level0 col60\" >traffic_congestion_relief_program_(_tcrp_)</th>        <th class=\"col_heading level0 col61\" >unknown</th>    </tr></thead><tbody>\n",
        "                <tr>\n",
-       "                        <th id=\"T_f19e5_level0_row0\" class=\"row_heading level0 row0\" >1413</th>\n",
-       "                        <td id=\"T_f19e5_row0_col0\" class=\"data row0 col0\" >5918(101)</td>\n",
-       "                        <td id=\"T_f19e5_row0_col1\" class=\"data row0 col1\" >4-26-2023: told Neal Hay to do a BAR request and that he cannot  ask for more than what was lapsed - JC\n",
+       "                        <th id=\"T_3da78_level0_row0\" class=\"row_heading level0 row0\" >1277</th>\n",
+       "                        <td id=\"T_3da78_row0_col0\" class=\"data row0 col0\" >5918(101)</td>\n",
+       "                        <td id=\"T_3da78_row0_col1\" class=\"data row0 col1\" >4-26-2023: told Neal Hay to do a BAR request and that he cannot  ask for more than what was lapsed - JC\n",
        "\n",
        "1/10/22: TCT JWalton adv of CWA expring and funds lapsing.  need invoice by Apr 1, 2022.  ab\n",
        "8/2/17: email SRiddle re: inactive status.  ab\n",
        "County will seek to replace (SR= 53.6)</td>\n",
-       "                        <td id=\"T_f19e5_row0_col2\" class=\"data row0 col2\" >03</td>\n",
-       "                        <td id=\"T_f19e5_row0_col3\" class=\"data row0 col3\" >nan</td>\n",
-       "                        <td id=\"T_f19e5_row0_col4\" class=\"data row0 col4\" >On Howsley Road, 1.02 Mile East of State Route 99, Br</td>\n",
-       "                        <td id=\"T_f19e5_row0_col5\" class=\"data row0 col5\" >Bridge Replacement</td>\n",
-       "                        <td id=\"T_f19e5_row0_col6\" class=\"data row0 col6\" >nan</td>\n",
-       "                        <td id=\"T_f19e5_row0_col7\" class=\"data row0 col7\" >nan</td>\n",
-       "                        <td id=\"T_f19e5_row0_col8\" class=\"data row0 col8\" >None</td>\n",
-       "                        <td id=\"T_f19e5_row0_col9\" class=\"data row0 col9\" >None</td>\n",
-       "                        <td id=\"T_f19e5_row0_col10\" class=\"data row0 col10\" >0-CR</td>\n",
-       "                        <td id=\"T_f19e5_row0_col11\" class=\"data row0 col11\" >N</td>\n",
-       "                        <td id=\"T_f19e5_row0_col12\" class=\"data row0 col12\" >None</td>\n",
-       "                        <td id=\"T_f19e5_row0_col13\" class=\"data row0 col13\" >2023-04-26 15:16:25</td>\n",
-       "                        <td id=\"T_f19e5_row0_col14\" class=\"data row0 col14\" >Sutter County</td>\n",
-       "                        <td id=\"T_f19e5_row0_col15\" class=\"data row0 col15\" >None</td>\n",
-       "                        <td id=\"T_f19e5_row0_col16\" class=\"data row0 col16\" >Sutter County</td>\n",
-       "                        <td id=\"T_f19e5_row0_col17\" class=\"data row0 col17\" >Bridge Replacement - No Added Capacity</td>\n",
-       "                        <td id=\"T_f19e5_row0_col18\" class=\"data row0 col18\" >Bridge Replacement</td>\n",
-       "                        <td id=\"T_f19e5_row0_col19\" class=\"data row0 col19\" >single phase</td>\n",
-       "                        <td id=\"T_f19e5_row0_col20\" class=\"data row0 col20\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col21\" class=\"data row0 col21\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col22\" class=\"data row0 col22\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col23\" class=\"data row0 col23\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col24\" class=\"data row0 col24\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col25\" class=\"data row0 col25\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col26\" class=\"data row0 col26\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col27\" class=\"data row0 col27\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col28\" class=\"data row0 col28\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col29\" class=\"data row0 col29\" >Yes</td>\n",
-       "                        <td id=\"T_f19e5_row0_col30\" class=\"data row0 col30\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col31\" class=\"data row0 col31\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col32\" class=\"data row0 col32\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col33\" class=\"data row0 col33\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col34\" class=\"data row0 col34\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col35\" class=\"data row0 col35\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col36\" class=\"data row0 col36\" >Yes</td>\n",
-       "                        <td id=\"T_f19e5_row0_col37\" class=\"data row0 col37\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col38\" class=\"data row0 col38\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col39\" class=\"data row0 col39\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col40\" class=\"data row0 col40\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col41\" class=\"data row0 col41\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col42\" class=\"data row0 col42\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col43\" class=\"data row0 col43\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col44\" class=\"data row0 col44\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col45\" class=\"data row0 col45\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col46\" class=\"data row0 col46\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col47\" class=\"data row0 col47\" >Yes</td>\n",
-       "                        <td id=\"T_f19e5_row0_col48\" class=\"data row0 col48\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col49\" class=\"data row0 col49\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col50\" class=\"data row0 col50\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col51\" class=\"data row0 col51\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col52\" class=\"data row0 col52\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col53\" class=\"data row0 col53\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col54\" class=\"data row0 col54\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col55\" class=\"data row0 col55\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col56\" class=\"data row0 col56\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col57\" class=\"data row0 col57\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col58\" class=\"data row0 col58\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col59\" class=\"data row0 col59\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col60\" class=\"data row0 col60\" >No</td>\n",
-       "                        <td id=\"T_f19e5_row0_col61\" class=\"data row0 col61\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col2\" class=\"data row0 col2\" >03</td>\n",
+       "                        <td id=\"T_3da78_row0_col3\" class=\"data row0 col3\" >nan</td>\n",
+       "                        <td id=\"T_3da78_row0_col4\" class=\"data row0 col4\" >On Howsley Road, 1.02 Mile East of State Route 99, Br</td>\n",
+       "                        <td id=\"T_3da78_row0_col5\" class=\"data row0 col5\" >Bridge Replacement</td>\n",
+       "                        <td id=\"T_3da78_row0_col6\" class=\"data row0 col6\" >nan</td>\n",
+       "                        <td id=\"T_3da78_row0_col7\" class=\"data row0 col7\" >nan</td>\n",
+       "                        <td id=\"T_3da78_row0_col8\" class=\"data row0 col8\" >None</td>\n",
+       "                        <td id=\"T_3da78_row0_col9\" class=\"data row0 col9\" >None</td>\n",
+       "                        <td id=\"T_3da78_row0_col10\" class=\"data row0 col10\" >0-CR</td>\n",
+       "                        <td id=\"T_3da78_row0_col11\" class=\"data row0 col11\" >N</td>\n",
+       "                        <td id=\"T_3da78_row0_col12\" class=\"data row0 col12\" >None</td>\n",
+       "                        <td id=\"T_3da78_row0_col13\" class=\"data row0 col13\" >2023-04-26 15:16:25</td>\n",
+       "                        <td id=\"T_3da78_row0_col14\" class=\"data row0 col14\" >Sutter County</td>\n",
+       "                        <td id=\"T_3da78_row0_col15\" class=\"data row0 col15\" >None</td>\n",
+       "                        <td id=\"T_3da78_row0_col16\" class=\"data row0 col16\" >Sutter County</td>\n",
+       "                        <td id=\"T_3da78_row0_col17\" class=\"data row0 col17\" >Bridge Replacement - No Added Capacity</td>\n",
+       "                        <td id=\"T_3da78_row0_col18\" class=\"data row0 col18\" >Bridge Replacement</td>\n",
+       "                        <td id=\"T_3da78_row0_col19\" class=\"data row0 col19\" >single phase</td>\n",
+       "                        <td id=\"T_3da78_row0_col20\" class=\"data row0 col20\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col21\" class=\"data row0 col21\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col22\" class=\"data row0 col22\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col23\" class=\"data row0 col23\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col24\" class=\"data row0 col24\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col25\" class=\"data row0 col25\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col26\" class=\"data row0 col26\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col27\" class=\"data row0 col27\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col28\" class=\"data row0 col28\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col29\" class=\"data row0 col29\" >Yes</td>\n",
+       "                        <td id=\"T_3da78_row0_col30\" class=\"data row0 col30\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col31\" class=\"data row0 col31\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col32\" class=\"data row0 col32\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col33\" class=\"data row0 col33\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col34\" class=\"data row0 col34\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col35\" class=\"data row0 col35\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col36\" class=\"data row0 col36\" >Yes</td>\n",
+       "                        <td id=\"T_3da78_row0_col37\" class=\"data row0 col37\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col38\" class=\"data row0 col38\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col39\" class=\"data row0 col39\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col40\" class=\"data row0 col40\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col41\" class=\"data row0 col41\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col42\" class=\"data row0 col42\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col43\" class=\"data row0 col43\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col44\" class=\"data row0 col44\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col45\" class=\"data row0 col45\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col46\" class=\"data row0 col46\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col47\" class=\"data row0 col47\" >Yes</td>\n",
+       "                        <td id=\"T_3da78_row0_col48\" class=\"data row0 col48\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col49\" class=\"data row0 col49\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col50\" class=\"data row0 col50\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col51\" class=\"data row0 col51\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col52\" class=\"data row0 col52\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col53\" class=\"data row0 col53\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col54\" class=\"data row0 col54\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col55\" class=\"data row0 col55\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col56\" class=\"data row0 col56\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col57\" class=\"data row0 col57\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col58\" class=\"data row0 col58\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col59\" class=\"data row0 col59\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col60\" class=\"data row0 col60\" >No</td>\n",
+       "                        <td id=\"T_3da78_row0_col61\" class=\"data row0 col61\" >No</td>\n",
        "            </tr>\n",
        "    </tbody></table>"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x25bd9b0dd48>"
+       "<pandas.io.formats.style.Styler at 0x1d6d5024948>"
       ]
      },
-     "execution_count": 226,
+     "execution_count": 84,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2073,7 +2050,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e2d50c21",
+   "id": "734fa9d5",
    "metadata": {},
    "source": [
     "## Phase_Funding Table"
@@ -2081,7 +2058,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "ebf0c60f",
+   "id": "8d673df6",
    "metadata": {},
    "source": [
     "### Bring in fund_code\n",
@@ -2090,8 +2067,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 227,
-   "id": "8a344942",
+   "execution_count": 85,
+   "id": "38f9b021",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2107,8 +2084,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 228,
-   "id": "7344566e",
+   "execution_count": 86,
+   "id": "cf667c4c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2117,8 +2094,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 229,
-   "id": "c8bc42a1",
+   "execution_count": 87,
+   "id": "eed59562",
    "metadata": {},
    "outputs": [
     {
@@ -2148,32 +2125,32 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>0183</td>\n",
-       "      <td>Environmental Enhanc &amp; Mitigat Prgm Fd</td>\n",
+       "      <th>12</th>\n",
+       "      <td>6056</td>\n",
+       "      <td>Trade Corridors Improvement Fund</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>10</th>\n",
-       "      <td>3291</td>\n",
-       "      <td>Trade Corridor Enhancement Account, STF</td>\n",
+       "      <th>5</th>\n",
+       "      <td>3007</td>\n",
+       "      <td>Traffic Congestion Relief Fund</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>0045</td>\n",
-       "      <td>Bicycle Transportation Account</td>\n",
+       "      <th>3</th>\n",
+       "      <td>0183</td>\n",
+       "      <td>Environmental Enhanc &amp; Mitigat Prgm Fd</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "    0001                             general_fund\n",
-       "3   0183   Environmental Enhanc & Mitigat Prgm Fd\n",
-       "10  3291  Trade Corridor Enhancement Account, STF\n",
-       "1   0045           Bicycle Transportation Account"
+       "    0001                            general_fund\n",
+       "12  6056        Trade Corridors Improvement Fund\n",
+       "5   3007          Traffic Congestion Relief Fund\n",
+       "3   0183  Environmental Enhanc & Mitigat Prgm Fd"
       ]
      },
-     "execution_count": 229,
+     "execution_count": 87,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2184,8 +2161,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 230,
-   "id": "d5432747",
+   "execution_count": 88,
+   "id": "990c4044",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2195,17 +2172,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 231,
-   "id": "9d010bfb",
+   "execution_count": 89,
+   "id": "2d4e7ec1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4305"
+       "4330"
       ]
      },
-     "execution_count": 231,
+     "execution_count": 89,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2216,8 +2193,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 232,
-   "id": "acaf7b88",
+   "execution_count": 90,
+   "id": "67589974",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2226,32 +2203,32 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 233,
-   "id": "4e9538f6",
+   "execution_count": 91,
+   "id": "bc661a10",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Federal Trust Fund                                   7647\n",
-       "State Highway Account                                1575\n",
-       "Road Maintenance & Rehabilitation Account, STF        433\n",
-       "Unknown                                               402\n",
-       "Local Bridge Seismic Retrofit Acct                     73\n",
+       "Federal Trust Fund                                   7464\n",
+       "State Highway Account                                1563\n",
+       "Unknown                                               505\n",
+       "Road Maintenance & Rehabilitation Account, STF        424\n",
+       "Local Bridge Seismic Retrofit Acct                     72\n",
        "Environmental Enhanc & Mitigat Prgm Fd                 56\n",
-       "Transportation Investment Fund                         29\n",
+       "Transportation Investment Fund                         28\n",
        "Transportation Deferred Investment Fund                19\n",
        "Trade Corridor Enhancement Account, STF                18\n",
        "Trade Corridors Improvement Fund                       13\n",
        "Traffic Congestion Relief Fund                          8\n",
-       "State-Local Partnership Program Acct                    7\n",
        "Highway Safety,Rehabilitation,& Preservation Acct       7\n",
-       "Corridor Mobility Improvement Account                   1\n",
+       "State-Local Partnership Program Acct                    7\n",
        "Transportation Faciilities Account                      1\n",
+       "Corridor Mobility Improvement Account                   1\n",
        "Name: general_fund, dtype: int64"
       ]
      },
-     "execution_count": 233,
+     "execution_count": 91,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2262,8 +2239,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 288,
-   "id": "1fb223cf",
+   "execution_count": 92,
+   "id": "cc3fbc10",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2280,8 +2257,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 235,
-   "id": "8a90d8e7",
+   "execution_count": 93,
+   "id": "f1f73846",
    "metadata": {},
    "outputs": [
     {
@@ -2342,10 +2319,10 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>15A5(001)</td>\n",
+       "      <td>15A5(013)</td>\n",
        "      <td>Federal Trust Fund</td>\n",
-       "      <td>849820.30</td>\n",
-       "      <td>700737.01</td>\n",
+       "      <td>172633.00</td>\n",
+       "      <td>0.00</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -2357,10 +2334,10 @@
        "1  0014(005)  Federal Trust Fund          879983.23                 879983.23\n",
        "2  0027(012)  Federal Trust Fund        12830458.87               12830458.87\n",
        "3  0061(025)  Federal Trust Fund         2595722.00                2595722.00\n",
-       "4  15A5(001)  Federal Trust Fund          849820.30                 700737.01"
+       "4  15A5(013)  Federal Trust Fund          172633.00                      0.00"
       ]
      },
-     "execution_count": 235,
+     "execution_count": 93,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2371,8 +2348,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 236,
-   "id": "7beb8071",
+   "execution_count": 94,
+   "id": "6060dc89",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2389,8 +2366,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 237,
-   "id": "8feb664b",
+   "execution_count": 95,
+   "id": "108ba2d2",
    "metadata": {},
    "outputs": [
     {
@@ -2446,9 +2423,9 @@
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>15A5(001)</td>\n",
-       "      <td>849820.30</td>\n",
-       "      <td>700737.01</td>\n",
+       "      <td>15A5(013)</td>\n",
+       "      <td>172633.00</td>\n",
+       "      <td>0.00</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -2460,10 +2437,10 @@
        "1  0014(005)          879983.23                     879983.23\n",
        "2  0027(012)        12830458.87                   12830458.87\n",
        "3  0061(025)         2595722.00                    2595722.00\n",
-       "4  15A5(001)          849820.30                     700737.01"
+       "4  15A5(013)          172633.00                          0.00"
       ]
      },
-     "execution_count": 237,
+     "execution_count": 95,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2474,17 +2451,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 238,
-   "id": "694c3ea3",
+   "execution_count": 96,
+   "id": "499a97b5",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4305"
+       "4330"
       ]
      },
-     "execution_count": 238,
+     "execution_count": 96,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2495,8 +2472,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 239,
-   "id": "def28835",
+   "execution_count": 97,
+   "id": "ad620b44",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2508,8 +2485,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 240,
-   "id": "e32b1394",
+   "execution_count": 98,
+   "id": "4b8ac706",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2518,8 +2495,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 241,
-   "id": "6647d17e",
+   "execution_count": 99,
+   "id": "5dc3d5e0",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2528,18 +2505,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 242,
-   "id": "5f806e1c",
+   "execution_count": 100,
+   "id": "e53fe275",
    "metadata": {},
    "outputs": [],
    "source": [
-    "fund_phase_df_pivot1 = to_snakecase(fund_phase_df_pivot1)"
+    "fund_phase_df_pivot1 = _utils.to_snakecase(fund_phase_df_pivot1)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 243,
-   "id": "871c217b",
+   "execution_count": 101,
+   "id": "4491ab40",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2549,8 +2526,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 244,
-   "id": "52c83028",
+   "execution_count": 102,
+   "id": "0e29b5dc",
    "metadata": {},
    "outputs": [
     {
@@ -2572,7 +2549,7 @@
        " 'transportation_investment_fund']"
       ]
      },
-     "execution_count": 244,
+     "execution_count": 102,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2583,8 +2560,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 245,
-   "id": "2139367b",
+   "execution_count": 103,
+   "id": "65359dc1",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2594,8 +2571,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 246,
-   "id": "590778fd",
+   "execution_count": 104,
+   "id": "e0568769",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2605,8 +2582,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 247,
-   "id": "2f7f8d5d",
+   "execution_count": 105,
+   "id": "5c6f2da7",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2616,8 +2593,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 248,
-   "id": "84f661b8",
+   "execution_count": 106,
+   "id": "efd1406e",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2626,8 +2603,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 249,
-   "id": "b01be19d",
+   "execution_count": 107,
+   "id": "de4dd1bf",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2638,8 +2615,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 250,
-   "id": "b92aa697",
+   "execution_count": 108,
+   "id": "03047890",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -2649,17 +2626,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 251,
-   "id": "feccd5ab",
+   "execution_count": 109,
+   "id": "d81411fd",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(4305, 20)"
+       "(4330, 20)"
       ]
      },
-     "execution_count": 251,
+     "execution_count": 109,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2670,17 +2647,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 252,
-   "id": "c6b364da",
+   "execution_count": 110,
+   "id": "e98e4b0a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "4305"
+       "4330"
       ]
      },
-     "execution_count": 252,
+     "execution_count": 110,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -2691,357 +2668,63 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 253,
-   "id": "49267b9b",
+   "execution_count": 111,
+   "id": "417a220f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Tag whether something is funded by state/federal/both\n",
-    "def is_state_funds(row):\n",
-    "    if row.total_state_funds > 0:\n",
-    "        return \"Yes\"\n",
-    "    else:\n",
-    "        return \"No\""
+    "final_fund_phase_df[\"is_state\"] = final_fund_phase_df.apply(_utils.is_state_funds, axis=1)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 254,
-   "id": "bede5e25",
+   "execution_count": 112,
+   "id": "04ad9e81",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def is_fed_funds(row):\n",
-    "    if row.total_federal_funds > 0:\n",
-    "        return \"Yes\"\n",
-    "    else:\n",
-    "        return \"No\""
+    "final_fund_phase_df[\"is_federal\"] = final_fund_phase_df.apply(_utils.is_fed_funds, axis=1)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 255,
-   "id": "a08995e3",
+   "execution_count": 113,
+   "id": "e6911d49",
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_fund_phase_df[\"is_state\"] = final_fund_phase_df.apply(is_state_funds, axis=1)"
+    "final_fund_phase_df = final_fund_phase_df.fillna(0)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 256,
-   "id": "13f768cb",
+   "execution_count": 114,
+   "id": "85bdef30",
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_fund_phase_df[\"is_federal\"] = final_fund_phase_df.apply(is_fed_funds, axis=1)"
+    "to_keep = ['project_id',  'single_phase_cost',\n",
+    "       'single_phase_expenditure_amt', 'total_state_funds','total_federal_funds', 'is_state',\n",
+    "       'is_federal']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 257,
-   "id": "bd42f0c3",
+   "execution_count": 115,
+   "id": "9a936952",
    "metadata": {},
    "outputs": [],
    "source": [
-    "final_fund_phase_df = final_fund_phase_df.fillna(0)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a8e168da",
-   "metadata": {},
-   "source": [
-    "### Double Checking\n",
-    "* Make sure the project flag is correct"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 258,
-   "id": "c7651df6",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "6211(130)    32\n",
-       "5908(031)    28\n",
-       "6053(130)    27\n",
-       "6211(131)    27\n",
-       "5006(219)    23\n",
-       "Name: project_id, dtype: int64"
-      ]
-     },
-     "execution_count": 258,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "fund_phase_df.project_id.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 259,
-   "id": "ca87e3a0",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<style  type=\"text/css\" >\n",
-       "#T_2b285_row0_col20,#T_2b285_row0_col21{\n",
-       "            color:  red;\n",
-       "        }</style><table id=\"T_2b285_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >corridor_mobility_improvement_account</th>        <th class=\"col_heading level0 col2\" >environmental_enhanc_&_mitigat_prgm_fd</th>        <th class=\"col_heading level0 col3\" >federal_trust_fund</th>        <th class=\"col_heading level0 col4\" >highway_safety,rehabilitation,&_preservation_acct</th>        <th class=\"col_heading level0 col5\" >local_bridge_seismic_retrofit_acct</th>        <th class=\"col_heading level0 col6\" >road_maintenance_&_rehabilitation_account,_stf</th>        <th class=\"col_heading level0 col7\" >state_highway_account</th>        <th class=\"col_heading level0 col8\" >state-local_partnership_program_acct</th>        <th class=\"col_heading level0 col9\" >trade_corridor_enhancement_account,_stf</th>        <th class=\"col_heading level0 col10\" >trade_corridors_improvement_fund</th>        <th class=\"col_heading level0 col11\" >traffic_congestion_relief_fund</th>        <th class=\"col_heading level0 col12\" >transportation_deferred_investment_fund</th>        <th class=\"col_heading level0 col13\" >transportation_faciilities_account</th>        <th class=\"col_heading level0 col14\" >transportation_investment_fund</th>        <th class=\"col_heading level0 col15\" >unknown</th>        <th class=\"col_heading level0 col16\" >total_state_funds</th>        <th class=\"col_heading level0 col17\" >single_phase_cost</th>        <th class=\"col_heading level0 col18\" >single_phase_expenditure_amt</th>        <th class=\"col_heading level0 col19\" >total_federal_funds</th>        <th class=\"col_heading level0 col20\" >is_state</th>        <th class=\"col_heading level0 col21\" >is_federal</th>    </tr></thead><tbody>\n",
-       "                <tr>\n",
-       "                        <th id=\"T_2b285_level0_row0\" class=\"row_heading level0 row0\" >3209</th>\n",
-       "                        <td id=\"T_2b285_row0_col0\" class=\"data row0 col0\" >5944(068)</td>\n",
-       "                        <td id=\"T_2b285_row0_col1\" class=\"data row0 col1\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col2\" class=\"data row0 col2\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col3\" class=\"data row0 col3\" >5412383.390000</td>\n",
-       "                        <td id=\"T_2b285_row0_col4\" class=\"data row0 col4\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col5\" class=\"data row0 col5\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col6\" class=\"data row0 col6\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col7\" class=\"data row0 col7\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col8\" class=\"data row0 col8\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col9\" class=\"data row0 col9\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col10\" class=\"data row0 col10\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col11\" class=\"data row0 col11\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col12\" class=\"data row0 col12\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col13\" class=\"data row0 col13\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col14\" class=\"data row0 col14\" >85000.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col15\" class=\"data row0 col15\" >0.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col16\" class=\"data row0 col16\" >85000.000000</td>\n",
-       "                        <td id=\"T_2b285_row0_col17\" class=\"data row0 col17\" >5497383.390000</td>\n",
-       "                        <td id=\"T_2b285_row0_col18\" class=\"data row0 col18\" >5497383.390000</td>\n",
-       "                        <td id=\"T_2b285_row0_col19\" class=\"data row0 col19\" >5412383.390000</td>\n",
-       "                        <td id=\"T_2b285_row0_col20\" class=\"data row0 col20\" >Yes</td>\n",
-       "                        <td id=\"T_2b285_row0_col21\" class=\"data row0 col21\" >Yes</td>\n",
-       "            </tr>\n",
-       "    </tbody></table>"
-      ],
-      "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x25bd8f64dc8>"
-      ]
-     },
-     "execution_count": 259,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 260,
-   "id": "f3b59b90",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>adv_project_id</th>\n",
-       "      <th>fund_code</th>\n",
-       "      <th>pec_code</th>\n",
-       "      <th>appropriation_category_code</th>\n",
-       "      <th>curr_bud_am</th>\n",
-       "      <th>cash_exp_am</th>\n",
-       "      <th>pect_task_code</th>\n",
-       "      <th>project_id</th>\n",
-       "      <th>general_fund</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>3804</th>\n",
-       "      <td>0500000588</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>0203</td>\n",
-       "      <td>630485.13</td>\n",
-       "      <td>630485.13</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5944(068)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3805</th>\n",
-       "      <td>0500000588</td>\n",
-       "      <td>3008</td>\n",
-       "      <td>2030600</td>\n",
-       "      <td>0506</td>\n",
-       "      <td>85000.00</td>\n",
-       "      <td>85000.00</td>\n",
-       "      <td>620</td>\n",
-       "      <td>5944(068)</td>\n",
-       "      <td>Transportation Investment Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3806</th>\n",
-       "      <td>0500000588</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>0910</td>\n",
-       "      <td>809514.72</td>\n",
-       "      <td>809514.72</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5944(068)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3807</th>\n",
-       "      <td>0500000588</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1213</td>\n",
-       "      <td>1001729.00</td>\n",
-       "      <td>1001729.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5944(068)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3808</th>\n",
-       "      <td>0500000588</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1516</td>\n",
-       "      <td>2970654.54</td>\n",
-       "      <td>2970654.54</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5944(068)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "3804     0500000588      0890  2030010                        0203   \n",
-       "3805     0500000588      3008  2030600                        0506   \n",
-       "3806     0500000588      0890  2030010                        0910   \n",
-       "3807     0500000588      0890  2030010                        1213   \n",
-       "3808     0500000588      0890  2030010                        1516   \n",
-       "\n",
-       "      curr_bud_am  cash_exp_am  pect_task_code project_id  \\\n",
-       "3804    630485.13    630485.13             300  5944(068)   \n",
-       "3805     85000.00     85000.00             620  5944(068)   \n",
-       "3806    809514.72    809514.72             300  5944(068)   \n",
-       "3807   1001729.00   1001729.00             300  5944(068)   \n",
-       "3808   2970654.54   2970654.54             300  5944(068)   \n",
-       "\n",
-       "                        general_fund  \n",
-       "3804              Federal Trust Fund  \n",
-       "3805  Transportation Investment Fund  \n",
-       "3806              Federal Trust Fund  \n",
-       "3807              Federal Trust Fund  \n",
-       "3808              Federal Trust Fund  "
-      ]
-     },
-     "execution_count": 260,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 261,
-   "id": "c6465aa8",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "curr_bud_am   5412383.39\n",
-       "dtype: float64"
-      ]
-     },
-     "execution_count": 261,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 262,
-   "id": "1a4ae929",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "curr_bud_am   5497383.39\n",
-       "dtype: float64"
-      ]
-     },
-     "execution_count": 262,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 263,
-   "id": "5dcaa987",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "3209   5497383.39\n",
-       "dtype: float64"
-      ]
-     },
-     "execution_count": 263,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund"
+    "final_fund_phase_df2 = final_fund_phase_df[to_keep]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 264,
-   "id": "88142d1f",
-   "metadata": {},
+   "execution_count": 116,
+   "id": "41b0c099",
+   "metadata": {
+    "scrolled": true
+   },
    "outputs": [
     {
      "data": {
@@ -3090,13 +2773,13 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>383</th>\n",
-       "      <td>5006(219)</td>\n",
+       "      <th>55</th>\n",
+       "      <td>18D3(041)</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>32967253.86</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>229400.00</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
@@ -3107,525 +2790,209 @@
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>229400.00</td>\n",
-       "      <td>33196653.86</td>\n",
-       "      <td>32534546.43</td>\n",
-       "      <td>32967253.86</td>\n",
-       "      <td>Yes</td>\n",
-       "      <td>Yes</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "    project_id  corridor_mobility_improvement_account  \\\n",
-       "383  5006(219)                                   0.00   \n",
-       "\n",
-       "     environmental_enhanc_&_mitigat_prgm_fd  federal_trust_fund  \\\n",
-       "383                                    0.00         32967253.86   \n",
-       "\n",
-       "     highway_safety,rehabilitation,&_preservation_acct  \\\n",
-       "383                                               0.00   \n",
-       "\n",
-       "     local_bridge_seismic_retrofit_acct  \\\n",
-       "383                           229400.00   \n",
-       "\n",
-       "     road_maintenance_&_rehabilitation_account,_stf  state_highway_account  \\\n",
-       "383                                            0.00                   0.00   \n",
-       "\n",
-       "     state-local_partnership_program_acct  \\\n",
-       "383                                  0.00   \n",
-       "\n",
-       "     trade_corridor_enhancement_account,_stf  \\\n",
-       "383                                     0.00   \n",
-       "\n",
-       "     trade_corridors_improvement_fund  traffic_congestion_relief_fund  \\\n",
-       "383                              0.00                            0.00   \n",
-       "\n",
-       "     transportation_deferred_investment_fund  \\\n",
-       "383                                     0.00   \n",
-       "\n",
-       "     transportation_faciilities_account  transportation_investment_fund  \\\n",
-       "383                                0.00                            0.00   \n",
-       "\n",
-       "     unknown  total_state_funds  single_phase_cost  \\\n",
-       "383     0.00          229400.00        33196653.86   \n",
-       "\n",
-       "     single_phase_expenditure_amt  total_federal_funds is_state is_federal  \n",
-       "383                   32534546.43          32967253.86      Yes        Yes  "
-      ]
-     },
-     "execution_count": 264,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 265,
-   "id": "61f4652c",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "curr_bud_am   32967253.86\n",
-       "dtype: float64"
-      ]
-     },
-     "execution_count": 265,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 266,
-   "id": "72e610b6",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "33196653.86"
-      ]
-     },
-     "execution_count": 266,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "229400.00 + 32967253.86"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 267,
-   "id": "9c4e34c0",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>adv_project_id</th>\n",
-       "      <th>fund_code</th>\n",
-       "      <th>pec_code</th>\n",
-       "      <th>appropriation_category_code</th>\n",
-       "      <th>curr_bud_am</th>\n",
-       "      <th>cash_exp_am</th>\n",
-       "      <th>pect_task_code</th>\n",
-       "      <th>project_id</th>\n",
-       "      <th>general_fund</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>5734</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>0506</td>\n",
-       "      <td>1000000.00</td>\n",
-       "      <td>1000000.00</td>\n",
-       "      <td>810</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5735</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1415</td>\n",
+       "      <th>57</th>\n",
+       "      <td>2006(034)</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5736</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>0809</td>\n",
-       "      <td>1691542.00</td>\n",
-       "      <td>1691542.00</td>\n",
-       "      <td>810</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5737</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>25448.42</td>\n",
-       "      <td>25448.42</td>\n",
-       "      <td>810</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5738</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>20206009.54</td>\n",
-       "      <td>20206009.54</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5739</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1314</td>\n",
-       "      <td>3216979.12</td>\n",
-       "      <td>3216979.12</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5740</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1415</td>\n",
-       "      <td>154672.27</td>\n",
-       "      <td>154672.27</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5741</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1617</td>\n",
-       "      <td>608787.00</td>\n",
-       "      <td>294068.82</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5742</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1920</td>\n",
-       "      <td>1876299.00</td>\n",
-       "      <td>1653630.55</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5743</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1819</td>\n",
-       "      <td>106000.00</td>\n",
-       "      <td>94331.53</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5744</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1819</td>\n",
-       "      <td>626000.00</td>\n",
-       "      <td>512947.67</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5745</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>20985.52</td>\n",
-       "      <td>20985.52</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5746</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>80000.00</td>\n",
-       "      <td>80000.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5747</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>210932.42</td>\n",
-       "      <td>210932.42</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5748</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>1338648.66</td>\n",
-       "      <td>1338648.66</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5749</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>19384.57</td>\n",
-       "      <td>19384.57</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5750</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1415</td>\n",
-       "      <td>39345.36</td>\n",
-       "      <td>39345.36</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5751</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>516666.98</td>\n",
-       "      <td>516666.98</td>\n",
-       "      <td>810</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5752</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1314</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>810</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>5753</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
        "      <td>0.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5754</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1213</td>\n",
-       "      <td>392119.00</td>\n",
-       "      <td>392119.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
+       "      <th>58</th>\n",
+       "      <td>2006(048)</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5755</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1415</td>\n",
-       "      <td>837434.00</td>\n",
-       "      <td>837434.00</td>\n",
-       "      <td>300</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Federal Trust Fund</td>\n",
+       "      <th>59</th>\n",
+       "      <td>2006(049)</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>5756</th>\n",
-       "      <td>0700001158</td>\n",
-       "      <td>6062</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>229400.00</td>\n",
-       "      <td>229400.00</td>\n",
-       "      <td>690</td>\n",
-       "      <td>5006(219)</td>\n",
-       "      <td>Local Bridge Seismic Retrofit Acct</td>\n",
+       "      <th>60</th>\n",
+       "      <td>2006(053)</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>No</td>\n",
+       "      <td>No</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "5734     0700001158      0890  2030010                        0506   \n",
-       "5735     0700001158      0890  2030010                        1415   \n",
-       "5736     0700001158      0890  2030010                        0809   \n",
-       "5737     0700001158      0890  2030010                        1011   \n",
-       "5738     0700001158      0890  2030010                        1112   \n",
-       "5739     0700001158      0890  2030010                        1314   \n",
-       "5740     0700001158      0890  2030010                        1415   \n",
-       "5741     0700001158      0890  2030010                        1617   \n",
-       "5742     0700001158      0890  2030010                        1920   \n",
-       "5743     0700001158      0890  2030010                        1819   \n",
-       "5744     0700001158      0890  2030010                        1819   \n",
-       "5745     0700001158      0890  2030010                        1011   \n",
-       "5746     0700001158      0890  2030010                        1011   \n",
-       "5747     0700001158      0890  2030010                        1011   \n",
-       "5748     0700001158      0890  2030010                        1011   \n",
-       "5749     0700001158      0890  2030010                        1011   \n",
-       "5750     0700001158      0890  2030010                        1415   \n",
-       "5751     0700001158      0890  2030010                        1112   \n",
-       "5752     0700001158      0890  2030010                        1314   \n",
-       "5753     0700001158      0890  2030010                        1112   \n",
-       "5754     0700001158      0890  2030010                        1213   \n",
-       "5755     0700001158      0890  2030010                        1415   \n",
-       "5756     0700001158      6062  2030010                        1112   \n",
-       "\n",
-       "      curr_bud_am  cash_exp_am  pect_task_code project_id  \\\n",
-       "5734   1000000.00   1000000.00             810  5006(219)   \n",
-       "5735         0.00         0.00             300  5006(219)   \n",
-       "5736   1691542.00   1691542.00             810  5006(219)   \n",
-       "5737     25448.42     25448.42             810  5006(219)   \n",
-       "5738  20206009.54  20206009.54             300  5006(219)   \n",
-       "5739   3216979.12   3216979.12             300  5006(219)   \n",
-       "5740    154672.27    154672.27             300  5006(219)   \n",
-       "5741    608787.00    294068.82             300  5006(219)   \n",
-       "5742   1876299.00   1653630.55             300  5006(219)   \n",
-       "5743    106000.00     94331.53             300  5006(219)   \n",
-       "5744    626000.00    512947.67             300  5006(219)   \n",
-       "5745     20985.52     20985.52             300  5006(219)   \n",
-       "5746     80000.00     80000.00             300  5006(219)   \n",
-       "5747    210932.42    210932.42             300  5006(219)   \n",
-       "5748   1338648.66   1338648.66             300  5006(219)   \n",
-       "5749     19384.57     19384.57             300  5006(219)   \n",
-       "5750     39345.36     39345.36             300  5006(219)   \n",
-       "5751    516666.98    516666.98             810  5006(219)   \n",
-       "5752         0.00         0.00             810  5006(219)   \n",
-       "5753         0.00         0.00             300  5006(219)   \n",
-       "5754    392119.00    392119.00             300  5006(219)   \n",
-       "5755    837434.00    837434.00             300  5006(219)   \n",
-       "5756    229400.00    229400.00             690  5006(219)   \n",
-       "\n",
-       "                            general_fund  \n",
-       "5734                  Federal Trust Fund  \n",
-       "5735                  Federal Trust Fund  \n",
-       "5736                  Federal Trust Fund  \n",
-       "5737                  Federal Trust Fund  \n",
-       "5738                  Federal Trust Fund  \n",
-       "5739                  Federal Trust Fund  \n",
-       "5740                  Federal Trust Fund  \n",
-       "5741                  Federal Trust Fund  \n",
-       "5742                  Federal Trust Fund  \n",
-       "5743                  Federal Trust Fund  \n",
-       "5744                  Federal Trust Fund  \n",
-       "5745                  Federal Trust Fund  \n",
-       "5746                  Federal Trust Fund  \n",
-       "5747                  Federal Trust Fund  \n",
-       "5748                  Federal Trust Fund  \n",
-       "5749                  Federal Trust Fund  \n",
-       "5750                  Federal Trust Fund  \n",
-       "5751                  Federal Trust Fund  \n",
-       "5752                  Federal Trust Fund  \n",
-       "5753                  Federal Trust Fund  \n",
-       "5754                  Federal Trust Fund  \n",
-       "5755                  Federal Trust Fund  \n",
-       "5756  Local Bridge Seismic Retrofit Acct  "
+       "   project_id  corridor_mobility_improvement_account  \\\n",
+       "55  18D3(041)                                   0.00   \n",
+       "57  2006(034)                                   0.00   \n",
+       "58  2006(048)                                   0.00   \n",
+       "59  2006(049)                                   0.00   \n",
+       "60  2006(053)                                   0.00   \n",
+       "\n",
+       "    environmental_enhanc_&_mitigat_prgm_fd  federal_trust_fund  \\\n",
+       "55                                    0.00                0.00   \n",
+       "57                                    0.00                0.00   \n",
+       "58                                    0.00                0.00   \n",
+       "59                                    0.00                0.00   \n",
+       "60                                    0.00                0.00   \n",
+       "\n",
+       "    highway_safety,rehabilitation,&_preservation_acct  \\\n",
+       "55                                               0.00   \n",
+       "57                                               0.00   \n",
+       "58                                               0.00   \n",
+       "59                                               0.00   \n",
+       "60                                               0.00   \n",
+       "\n",
+       "    local_bridge_seismic_retrofit_acct  \\\n",
+       "55                                0.00   \n",
+       "57                                0.00   \n",
+       "58                                0.00   \n",
+       "59                                0.00   \n",
+       "60                                0.00   \n",
+       "\n",
+       "    road_maintenance_&_rehabilitation_account,_stf  state_highway_account  \\\n",
+       "55                                            0.00                   0.00   \n",
+       "57                                            0.00                   0.00   \n",
+       "58                                            0.00                   0.00   \n",
+       "59                                            0.00                   0.00   \n",
+       "60                                            0.00                   0.00   \n",
+       "\n",
+       "    state-local_partnership_program_acct  \\\n",
+       "55                                  0.00   \n",
+       "57                                  0.00   \n",
+       "58                                  0.00   \n",
+       "59                                  0.00   \n",
+       "60                                  0.00   \n",
+       "\n",
+       "    trade_corridor_enhancement_account,_stf  trade_corridors_improvement_fund  \\\n",
+       "55                                     0.00                              0.00   \n",
+       "57                                     0.00                              0.00   \n",
+       "58                                     0.00                              0.00   \n",
+       "59                                     0.00                              0.00   \n",
+       "60                                     0.00                              0.00   \n",
+       "\n",
+       "    traffic_congestion_relief_fund  transportation_deferred_investment_fund  \\\n",
+       "55                            0.00                                     0.00   \n",
+       "57                            0.00                                     0.00   \n",
+       "58                            0.00                                     0.00   \n",
+       "59                            0.00                                     0.00   \n",
+       "60                            0.00                                     0.00   \n",
+       "\n",
+       "    transportation_faciilities_account  transportation_investment_fund  \\\n",
+       "55                                0.00                            0.00   \n",
+       "57                                0.00                            0.00   \n",
+       "58                                0.00                            0.00   \n",
+       "59                                0.00                            0.00   \n",
+       "60                                0.00                            0.00   \n",
+       "\n",
+       "    unknown  total_state_funds  single_phase_cost  \\\n",
+       "55     0.00               0.00               0.00   \n",
+       "57     0.00               0.00               0.00   \n",
+       "58     0.00               0.00               0.00   \n",
+       "59     0.00               0.00               0.00   \n",
+       "60     0.00               0.00               0.00   \n",
+       "\n",
+       "    single_phase_expenditure_amt  total_federal_funds is_state is_federal  \n",
+       "55                          0.00                 0.00       No         No  \n",
+       "57                          0.00                 0.00       No         No  \n",
+       "58                          0.00                 0.00       No         No  \n",
+       "59                          0.00                 0.00       No         No  \n",
+       "60                          0.00                 0.00       No         No  "
       ]
      },
-     "execution_count": 267,
+     "execution_count": 116,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "fund_phase_df.loc[fund_phase_df.project_id ==  '5006(219)']"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "7f0ddf53",
-   "metadata": {},
-   "source": [
-    "## Awards Table\n",
-    "* Appropriation code is the fiscal year of award\n"
+    "final_fund_phase_df.loc[(final_fund_phase_df.is_state == \"No\") & (final_fund_phase_df.is_federal == \"No\")].head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 268,
-   "id": "f6e5a5a7",
+   "execution_count": 117,
+   "id": "2f222fad",
    "metadata": {},
    "outputs": [
     {
@@ -3649,185 +3016,189 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>adv_project_id</th>\n",
-       "      <th>fund_code</th>\n",
-       "      <th>pec_code</th>\n",
-       "      <th>appropriation_category_code</th>\n",
-       "      <th>curr_bud_am</th>\n",
-       "      <th>cash_exp_am</th>\n",
-       "      <th>pect_task_code</th>\n",
+       "      <th></th>\n",
        "      <th>project_id</th>\n",
-       "      <th>pec</th>\n",
-       "      <th>pect</th>\n",
-       "      <th>pect_description</th>\n",
-       "      <th>program</th>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>is_state</th>\n",
+       "      <th>is_federal</th>\n",
+       "      <th></th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>858</th>\n",
-       "      <td>0214000121</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1617</td>\n",
-       "      <td>762938.00</td>\n",
-       "      <td>435821.60</td>\n",
-       "      <td>560</td>\n",
-       "      <td>5905(099)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>560.00</td>\n",
-       "      <td>High Risk Rural Roads Program (HR3)</td>\n",
-       "      <td>Local Assistance</td>\n",
+       "      <th rowspan=\"2\" valign=\"top\">No</th>\n",
+       "      <th>No</th>\n",
+       "      <td>448</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Yes</th>\n",
+       "      <td>2500</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th rowspan=\"2\" valign=\"top\">Yes</th>\n",
+       "      <th>No</th>\n",
+       "      <td>1247</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>Yes</th>\n",
+       "      <td>135</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "    adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "858     0214000121      0890  2030010                        1617   \n",
-       "\n",
-       "     curr_bud_am  cash_exp_am  pect_task_code project_id      pec   pect  \\\n",
-       "858    762938.00    435821.60             560  5905(099)  2030010 560.00   \n",
-       "\n",
-       "                        pect_description           program  \n",
-       "858  High Risk Rural Roads Program (HR3)  Local Assistance  "
+       "                     project_id\n",
+       "is_state is_federal            \n",
+       "No       No                 448\n",
+       "         Yes               2500\n",
+       "Yes      No                1247\n",
+       "         Yes                135"
       ]
      },
-     "execution_count": 268,
+     "execution_count": 117,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "pect_df.sample()"
+    "final_fund_phase_df.groupby(['is_state', 'is_federal']).agg({'project_id':'nunique'})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "31cc5696",
+   "metadata": {},
+   "source": [
+    "### Double Checking\n",
+    "* Make sure the project flag is correct"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 269,
-   "id": "51f10795",
+   "execution_count": 118,
+   "id": "e71ef88d",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>adv_project_id</th>\n",
-       "      <th>fund_code</th>\n",
-       "      <th>pec_code</th>\n",
-       "      <th>appropriation_category_code</th>\n",
-       "      <th>curr_bud_am</th>\n",
-       "      <th>cash_exp_am</th>\n",
-       "      <th>pect_task_code</th>\n",
-       "      <th>project_id</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>5928</th>\n",
-       "      <td>0700020294</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>554663.00</td>\n",
-       "      <td>554663.00</td>\n",
-       "      <td>690</td>\n",
-       "      <td>5953(650)</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "5928     0700020294      0890  2030010                        1112   \n",
-       "\n",
-       "      curr_bud_am  cash_exp_am  pect_task_code project_id  \n",
-       "5928    554663.00    554663.00             690  5953(650)  "
-      ]
-     },
-     "execution_count": 269,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "accounting_df.sample()"
+    "# fund_phase_df.project_id.value_counts().head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 270,
-   "id": "e33565a3",
+   "execution_count": 119,
+   "id": "87447a97",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Only want the most recent year of a pec_code listed once\n",
-    "awards_df = (pect_df\n",
-    "                  .groupby(['project_id', 'program'])\n",
-    "                  .agg({'appropriation_category_code':'max'})\n",
-    "                  .reset_index()\n",
-    "                  .rename(columns = {'appropriation_category_code':'state_fiscal_awarded_year',\n",
-    "                                     'program':'grant_program'})\n",
-    "                  )"
+    "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "edbc2079",
+   "cell_type": "code",
+   "execution_count": 120,
+   "id": "ff83f49b",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Checks"
+    "# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 271,
-   "id": "bd775031",
+   "execution_count": 121,
+   "id": "f8a7cc78",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "id": "f8d7e6c6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "ef5bdbcc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 124,
+   "id": "bbce5cbd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 125,
+   "id": "f61fbd0b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "9eb9c539",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "5182(058)    3\n",
-       "5288(046)    3\n",
-       "5475(038)    3\n",
-       "6066(140)    3\n",
-       "6090(059)    3\n",
-       "Name: project_id, dtype: int64"
+       "33196653.86"
       ]
      },
-     "execution_count": 271,
+     "execution_count": 126,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "awards_df.project_id.value_counts().head()"
+    "229400.00 + 32967253.86"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 127,
+   "id": "a12dbb0c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# fund_phase_df.loc[fund_phase_df.project_id ==  '5006(219)']"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "46c290bd",
+   "metadata": {},
+   "source": [
+    "## Awards Table\n",
+    "* Appropriation code is the fiscal year of award\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 272,
-   "id": "d0b1444e",
+   "execution_count": 128,
+   "id": "8d2670d1",
    "metadata": {},
    "outputs": [
     {
@@ -3851,64 +3222,67 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
+       "      <th>adv_project_id</th>\n",
+       "      <th>fund_code</th>\n",
+       "      <th>pec_code</th>\n",
+       "      <th>appropriation_category_code</th>\n",
+       "      <th>curr_bud_am</th>\n",
+       "      <th>cash_exp_am</th>\n",
+       "      <th>pect_task_code</th>\n",
        "      <th>project_id</th>\n",
-       "      <th>grant_program</th>\n",
-       "      <th>state_fiscal_awarded_year</th>\n",
+       "      <th>pec</th>\n",
+       "      <th>pect</th>\n",
+       "      <th>pect_description</th>\n",
+       "      <th>program</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1546</th>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>Active Transportation Program (ATP)</td>\n",
-       "      <td>2223</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1547</th>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "      <td>2223</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1548</th>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017</td>\n",
-       "      <td>2122</td>\n",
+       "      <th>8504</th>\n",
+       "      <td>1013000080</td>\n",
+       "      <td>0042</td>\n",
+       "      <td>2030600</td>\n",
+       "      <td>1213</td>\n",
+       "      <td>25000.00</td>\n",
+       "      <td>25000.00</td>\n",
+       "      <td>621</td>\n",
+       "      <td>5940(103)</td>\n",
+       "      <td>2030600</td>\n",
+       "      <td>621.00</td>\n",
+       "      <td>Local Roads Rehabilitation</td>\n",
+       "      <td>State Transportation Improvement Program (STIP)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     project_id  \\\n",
-       "1546  5182(058)   \n",
-       "1547  5182(058)   \n",
-       "1548  5182(058)   \n",
+       "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
+       "8504     1013000080      0042  2030600                        1213   \n",
        "\n",
-       "                                                                                                                                                  grant_program  \\\n",
-       "1546                                                                                                                        Active Transportation Program (ATP)   \n",
-       "1547                                                                                                                                           Local Assistance   \n",
-       "1548  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017   \n",
+       "      curr_bud_am  cash_exp_am  pect_task_code project_id      pec   pect  \\\n",
+       "8504     25000.00     25000.00             621  5940(103)  2030600 621.00   \n",
        "\n",
-       "     state_fiscal_awarded_year  \n",
-       "1546                      2223  \n",
-       "1547                      2223  \n",
-       "1548                      2122  "
+       "                pect_description  \\\n",
+       "8504  Local Roads Rehabilitation   \n",
+       "\n",
+       "                                              program  \n",
+       "8504  State Transportation Improvement Program (STIP)  "
       ]
      },
-     "execution_count": 272,
+     "execution_count": 128,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "awards_df.loc[awards_df.project_id == \"5182(058)\"]"
+    "pect_df.sample()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 273,
-   "id": "c7feef8d",
+   "execution_count": 129,
+   "id": "d48c1f0f",
    "metadata": {},
    "outputs": [
     {
@@ -3940,132 +3314,19 @@
        "      <th>cash_exp_am</th>\n",
        "      <th>pect_task_code</th>\n",
        "      <th>project_id</th>\n",
-       "      <th>pec</th>\n",
-       "      <th>pect</th>\n",
-       "      <th>pect_description</th>\n",
-       "      <th>program</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1412</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030720</td>\n",
-       "      <td>2223</td>\n",
-       "      <td>4318000.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>100</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030720</td>\n",
-       "      <td>100.00</td>\n",
-       "      <td>Active Transportation Program (ATP)</td>\n",
-       "      <td>Active Transportation Program (ATP)</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1413</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0042</td>\n",
-       "      <td>2030210</td>\n",
-       "      <td>2122</td>\n",
-       "      <td>6239000.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>350</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030210</td>\n",
-       "      <td>350.00</td>\n",
-       "      <td>Solutions for Congested Corridors Program (SCCP)</td>\n",
-       "      <td>Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1414</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1011</td>\n",
-       "      <td>456704.00</td>\n",
-       "      <td>456704.00</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1415</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1112</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1416</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1213</td>\n",
-       "      <td>54423.24</td>\n",
-       "      <td>54423.24</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1417</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1920</td>\n",
-       "      <td>50000.00</td>\n",
-       "      <td>13000.00</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1418</th>\n",
-       "      <td>0312000145</td>\n",
+       "      <th>6581</th>\n",
+       "      <td>0718000255</td>\n",
        "      <td>0890</td>\n",
        "      <td>2030010</td>\n",
-       "      <td>2223</td>\n",
-       "      <td>333821.00</td>\n",
+       "      <td>2122</td>\n",
+       "      <td>1238310.00</td>\n",
        "      <td>0.00</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1419</th>\n",
-       "      <td>0312000145</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1516</td>\n",
-       "      <td>34991.76</td>\n",
-       "      <td>34991.76</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5182(058)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
+       "      <td>550</td>\n",
+       "      <td>5257(037)</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
@@ -4073,159 +3334,76 @@
       ],
       "text/plain": [
        "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "1412     0312000145      0890  2030720                        2223   \n",
-       "1413     0312000145      0042  2030210                        2122   \n",
-       "1414     0312000145      0890  2030010                        1011   \n",
-       "1415     0312000145      0890  2030010                        1112   \n",
-       "1416     0312000145      0890  2030010                        1213   \n",
-       "1417     0312000145      0890  2030010                        1920   \n",
-       "1418     0312000145      0890  2030010                        2223   \n",
-       "1419     0312000145      0890  2030010                        1516   \n",
-       "\n",
-       "      curr_bud_am  cash_exp_am  pect_task_code project_id      pec   pect  \\\n",
-       "1412   4318000.00         0.00             100  5182(058)  2030720 100.00   \n",
-       "1413   6239000.00         0.00             350  5182(058)  2030210 350.00   \n",
-       "1414    456704.00    456704.00             820  5182(058)  2030010 820.00   \n",
-       "1415         0.00         0.00             820  5182(058)  2030010 820.00   \n",
-       "1416     54423.24     54423.24             820  5182(058)  2030010 820.00   \n",
-       "1417     50000.00     13000.00             820  5182(058)  2030010 820.00   \n",
-       "1418    333821.00         0.00             820  5182(058)  2030010 820.00   \n",
-       "1419     34991.76     34991.76             820  5182(058)  2030010 820.00   \n",
-       "\n",
-       "                                                    pect_description  \\\n",
-       "1412                             Active Transportation Program (ATP)   \n",
-       "1413                Solutions for Congested Corridors Program (SCCP)   \n",
-       "1414  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1415  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1416  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1417  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1418  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1419  Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
+       "6581     0718000255      0890  2030010                        2122   \n",
        "\n",
-       "                                                                                                                                                        program  \n",
-       "1412                                                                                                                        Active Transportation Program (ATP)  \n",
-       "1413  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017  \n",
-       "1414                                                                                                                                           Local Assistance  \n",
-       "1415                                                                                                                                           Local Assistance  \n",
-       "1416                                                                                                                                           Local Assistance  \n",
-       "1417                                                                                                                                           Local Assistance  \n",
-       "1418                                                                                                                                           Local Assistance  \n",
-       "1419                                                                                                                                           Local Assistance  "
+       "      curr_bud_am  cash_exp_am  pect_task_code project_id  \n",
+       "6581   1238310.00         0.00             550  5257(037)  "
       ]
      },
-     "execution_count": 273,
+     "execution_count": 129,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "# Check original df \n",
-    "pect_df.loc[pect_df.project_id == \"5182(058)\"]"
+    "accounting_df.sample()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 274,
-   "id": "0bbac42d",
+   "execution_count": 130,
+   "id": "2c9cca16",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Only want the most recent year of a pec_code listed once\n",
+    "awards_df = (pect_df\n",
+    "                  .groupby(['project_id', 'program'])\n",
+    "                  .agg({'appropriation_category_code':'max'})\n",
+    "                  .reset_index()\n",
+    "                  .rename(columns = {'appropriation_category_code':'state_fiscal_awarded_year',\n",
+    "                                     'program':'grant_program'})\n",
+    "                  )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3de7091b",
+   "metadata": {},
+   "source": [
+    "## Checks"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "id": "b7c63b95",
    "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<style  type=\"text/css\" >\n",
-       "#T_7f970_row0_col40,#T_7f970_row0_col41,#T_7f970_row0_col47{\n",
-       "            color:  red;\n",
-       "        }</style><table id=\"T_7f970_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >comment_desc</th>        <th class=\"col_heading level0 col2\" >district_code</th>        <th class=\"col_heading level0 col3\" >est_total_prj_costs</th>        <th class=\"col_heading level0 col4\" >location_name</th>        <th class=\"col_heading level0 col5\" >project_label_name</th>        <th class=\"col_heading level0 col6\" >original_post_mile_begin_id</th>        <th class=\"col_heading level0 col7\" >original_post_mile_end_id</th>        <th class=\"col_heading level0 col8\" >revised_post_mile_begin_ind</th>        <th class=\"col_heading level0 col9\" >revised_post_mile_end_ind</th>        <th class=\"col_heading level0 col10\" >route_name</th>        <th class=\"col_heading level0 col11\" >state_hwy_ind</th>        <th class=\"col_heading level0 col12\" >senate_district_code</th>        <th class=\"col_heading level0 col13\" >update_date_time</th>        <th class=\"col_heading level0 col14\" >agency_name</th>        <th class=\"col_heading level0 col15\" >urban_area_code</th>        <th class=\"col_heading level0 col16\" >county_name</th>        <th class=\"col_heading level0 col17\" >work_type_desc</th>        <th class=\"col_heading level0 col18\" >category_desc</th>        <th class=\"col_heading level0 col19\" >current_phase</th>        <th class=\"col_heading level0 col20\" >active_transportation_program_(atp)</th>        <th class=\"col_heading level0 col21\" >bridge_inspection_&_scour_evaluation</th>        <th class=\"col_heading level0 col22\" >covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation</th>        <th class=\"col_heading level0 col23\" >carbon_reduction_program_(crp)</th>        <th class=\"col_heading level0 col24\" >congestion_mitigation_&_air_quality_improvement_program_(cmaq)</th>        
<th class=\"col_heading level0 col25\" >coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds</th>        <th class=\"col_heading level0 col26\" >corridor_mobility_improvement_account_(cmia)_program</th>        <th class=\"col_heading level0 col27\" >county_exchange_funds</th>        <th class=\"col_heading level0 col28\" >county_state_match_program</th>        <th class=\"col_heading level0 col29\" >earmarks_projects_(hpp,_demo_cpfcds,_etc.)</th>        <th class=\"col_heading level0 col30\" >emergency_relief_(er)</th>        <th class=\"col_heading level0 col31\" >ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program</th>        <th class=\"col_heading level0 col32\" >funds_for_planning,_programming_and_monitoring_-_rip</th>        <th class=\"col_heading level0 col33\" >general_funded_designated_programs</th>        <th class=\"col_heading level0 col34\" >hazard_elimination_safety_(hes)</th>        <th class=\"col_heading level0 col35\" >high_risk_rural_roads_program_(hr3)</th>        <th class=\"col_heading level0 col36\" >highway_bridge_</th>        <th class=\"col_heading level0 col37\" >highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund</th>        <th class=\"col_heading level0 col38\" >highway_safety_improvement_program_(hsip)_(non-infrastructure)</th>        <th class=\"col_heading level0 col39\" >highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund</th>        <th class=\"col_heading level0 col40\" >local_partnership_program_(lpp_–_competitive)_</th>        <th class=\"col_heading level0 col41\" >local_roads</th>        <th class=\"col_heading level0 col42\" >local_roads_rehabilitation</th>        <th class=\"col_heading level0 col43\" >railroad_grade_crossing_protection</th>        <th class=\"col_heading level0 col44\" >railroad_grade_separations</th>        <th class=\"col_heading level0 col45\" 
>rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_</th>        <th class=\"col_heading level0 col46\" >regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system)</th>        <th class=\"col_heading level0 col47\" >regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip)</th>        <th class=\"col_heading level0 col48\" >regional_transportation_planning_agency_(rtpa)_stp_match_exchange</th>        <th class=\"col_heading level0 col49\" >sb1_funded_freeway_service_patrol</th>        <th class=\"col_heading level0 col50\" >shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds</th>        <th class=\"col_heading level0 col51\" >safe_routes_to_school_(sr2s_and_srts)</th>        <th class=\"col_heading level0 col52\" >set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act</th>        <th class=\"col_heading level0 col53\" >solutions_for_congested_corridors_program_(sccp)</th>        <th class=\"col_heading level0 col54\" >special_programs</th>        <th class=\"col_heading level0 col55\" >state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic)</th>        <th class=\"col_heading level0 col56\" >structures_seismic_retrofit_</th>        <th class=\"col_heading level0 col57\" >trade_corridor_enhancement_account_(tcea)_programs_–_local_share</th>        <th class=\"col_heading level0 col58\" >trade_corridor_enhancement_account_(tcea)_programs_–_state_share</th>        <th class=\"col_heading level0 col59\" >trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads</th>        <th class=\"col_heading level0 col60\" >traffic_congestion_relief_program_(_tcrp_)</th>        <th class=\"col_heading level0 col61\" >unknown</th>    </tr></thead><tbody>\n",
-       "                <tr>\n",
-       "                        <th id=\"T_7f970_level0_row0\" class=\"row_heading level0 row0\" >7834</th>\n",
-       "                        <td id=\"T_7f970_row0_col0\" class=\"data row0 col0\" >5288(046)</td>\n",
-       "                        <td id=\"T_7f970_row0_col1\" class=\"data row0 col1\" >Data Migrated from CTIPS :\r\n",
-       "The Project Planning Id are: 1785; \r\n",
-       "The locations are :In Folsom on White Rock Road in the vicinity of the Scott Road Intersection. Widen 1 mile of 4-lane roadway and signalize 1 Intersection.;\n",
-       "2/13/2020:  This project is the same as STPL-6498(003).  Agency is determining whether CMGC negotiations will be viable via the JPA and if not, project will be turned over to City of Folsom to implement/construct.  CR\n",
-       "2/13/2020:  This project is the same as \n",
-       "Project has $10,000 LPP and $15,000 RIP/STIP.\n",
-       "8/22/22:  Cost adj to correct local funds to local AC $6,201,500.  Erroneously entered as local funds in prior sequence.</td>\n",
-       "                        <td id=\"T_7f970_row0_col2\" class=\"data row0 col2\" >03</td>\n",
-       "                        <td id=\"T_7f970_row0_col3\" class=\"data row0 col3\" >25750000.000000</td>\n",
-       "                        <td id=\"T_7f970_row0_col4\" class=\"data row0 col4\" >In City of Folsom, on White Rock Road from Prairie City Road to East Bidwell Street.</td>\n",
-       "                        <td id=\"T_7f970_row0_col5\" class=\"data row0 col5\" >Construct 4 lane road with 8 foot shoulders</td>\n",
-       "                        <td id=\"T_7f970_row0_col6\" class=\"data row0 col6\" >nan</td>\n",
-       "                        <td id=\"T_7f970_row0_col7\" class=\"data row0 col7\" >nan</td>\n",
-       "                        <td id=\"T_7f970_row0_col8\" class=\"data row0 col8\" >None</td>\n",
-       "                        <td id=\"T_7f970_row0_col9\" class=\"data row0 col9\" >None</td>\n",
-       "                        <td id=\"T_7f970_row0_col10\" class=\"data row0 col10\" >0-FOL</td>\n",
-       "                        <td id=\"T_7f970_row0_col11\" class=\"data row0 col11\" >N</td>\n",
-       "                        <td id=\"T_7f970_row0_col12\" class=\"data row0 col12\" >None</td>\n",
-       "                        <td id=\"T_7f970_row0_col13\" class=\"data row0 col13\" >2023-10-27 10:33:06</td>\n",
-       "                        <td id=\"T_7f970_row0_col14\" class=\"data row0 col14\" >Folsom</td>\n",
-       "                        <td id=\"T_7f970_row0_col15\" class=\"data row0 col15\" >3067</td>\n",
-       "                        <td id=\"T_7f970_row0_col16\" class=\"data row0 col16\" >Sacramento County</td>\n",
-       "                        <td id=\"T_7f970_row0_col17\" class=\"data row0 col17\" >None</td>\n",
-       "                        <td id=\"T_7f970_row0_col18\" class=\"data row0 col18\" >Roadway Widening</td>\n",
-       "                        <td id=\"T_7f970_row0_col19\" class=\"data row0 col19\" >single phase</td>\n",
-       "                        <td id=\"T_7f970_row0_col20\" class=\"data row0 col20\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col21\" class=\"data row0 col21\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col22\" class=\"data row0 col22\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col23\" class=\"data row0 col23\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col24\" class=\"data row0 col24\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col25\" class=\"data row0 col25\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col26\" class=\"data row0 col26\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col27\" class=\"data row0 col27\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col28\" class=\"data row0 col28\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col29\" class=\"data row0 col29\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col30\" class=\"data row0 col30\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col31\" class=\"data row0 col31\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col32\" class=\"data row0 col32\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col33\" class=\"data row0 col33\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col34\" class=\"data row0 col34\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col35\" class=\"data row0 col35\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col36\" class=\"data row0 col36\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col37\" class=\"data row0 col37\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col38\" class=\"data row0 col38\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col39\" class=\"data row0 col39\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col40\" class=\"data row0 col40\" >Yes</td>\n",
-       "                        <td id=\"T_7f970_row0_col41\" class=\"data row0 col41\" >Yes</td>\n",
-       "                        <td id=\"T_7f970_row0_col42\" class=\"data row0 col42\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col43\" class=\"data row0 col43\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col44\" class=\"data row0 col44\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col45\" class=\"data row0 col45\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col46\" class=\"data row0 col46\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col47\" class=\"data row0 col47\" >Yes</td>\n",
-       "                        <td id=\"T_7f970_row0_col48\" class=\"data row0 col48\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col49\" class=\"data row0 col49\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col50\" class=\"data row0 col50\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col51\" class=\"data row0 col51\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col52\" class=\"data row0 col52\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col53\" class=\"data row0 col53\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col54\" class=\"data row0 col54\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col55\" class=\"data row0 col55\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col56\" class=\"data row0 col56\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col57\" class=\"data row0 col57\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col58\" class=\"data row0 col58\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col59\" class=\"data row0 col59\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col60\" class=\"data row0 col60\" >No</td>\n",
-       "                        <td id=\"T_7f970_row0_col61\" class=\"data row0 col61\" >No</td>\n",
-       "            </tr>\n",
-       "    </tbody></table>"
-      ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x25bd8d9bb88>"
+       "6066(140)    3\n",
+       "5938(233)    3\n",
+       "5956(221)    3\n",
+       "5182(058)    3\n",
+       "5475(038)    3\n",
+       "Name: project_id, dtype: int64"
       ]
      },
-     "execution_count": 274,
+     "execution_count": 131,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "project_df[project_df.project_id ==  \"5288(046)\"].style.where(lambda val: 'Yes' in str(val), 'color: red')"
+    "awards_df.project_id.value_counts().head()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 275,
-   "id": "baf7d57e",
+   "execution_count": 132,
+   "id": "da2a3dcc",
    "metadata": {},
    "outputs": [
     {
@@ -4256,20 +3434,20 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>2416</th>\n",
-       "      <td>5475(038)</td>\n",
+       "      <th>1550</th>\n",
+       "      <td>5182(058)</td>\n",
        "      <td>Active Transportation Program (ATP)</td>\n",
        "      <td>2223</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2417</th>\n",
-       "      <td>5475(038)</td>\n",
+       "      <th>1551</th>\n",
+       "      <td>5182(058)</td>\n",
        "      <td>Local Assistance</td>\n",
        "      <td>2223</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>2418</th>\n",
-       "      <td>5475(038)</td>\n",
+       "      <th>1552</th>\n",
+       "      <td>5182(058)</td>\n",
        "      <td>Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017</td>\n",
        "      <td>2122</td>\n",
        "    </tr>\n",
@@ -4279,34 +3457,55 @@
       ],
       "text/plain": [
        "     project_id  \\\n",
-       "2416  5475(038)   \n",
-       "2417  5475(038)   \n",
-       "2418  5475(038)   \n",
+       "1550  5182(058)   \n",
+       "1551  5182(058)   \n",
+       "1552  5182(058)   \n",
        "\n",
        "                                                                                                                                                  grant_program  \\\n",
-       "2416                                                                                                                        Active Transportation Program (ATP)   \n",
-       "2417                                                                                                                                           Local Assistance   \n",
-       "2418  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017   \n",
+       "1550                                                                                                                        Active Transportation Program (ATP)   \n",
+       "1551                                                                                                                                           Local Assistance   \n",
+       "1552  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017   \n",
        "\n",
        "     state_fiscal_awarded_year  \n",
-       "2416                      2223  \n",
-       "2417                      2223  \n",
-       "2418                      2122  "
+       "1550                      2223  \n",
+       "1551                      2223  \n",
+       "1552                      2122  "
       ]
      },
-     "execution_count": 275,
+     "execution_count": 132,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "awards_df.loc[awards_df.project_id == \"5475(038)\"]"
+    "awards_df.loc[awards_df.project_id == \"5182(058)\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 133,
+   "id": "d6d426ce",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Check original df \n",
+    "# pect_df.loc[pect_df.project_id == \"5182(058)\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "id": "887d4afd",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# project_df[project_df.project_id ==  \"5288(046)\"].style.where(lambda val: 'Yes' in str(val), 'color: red')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 276,
-   "id": "b089062a",
+   "execution_count": 135,
+   "id": "8bb2e36c",
    "metadata": {},
    "outputs": [
     {
@@ -4330,229 +3529,160 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>adv_project_id</th>\n",
-       "      <th>fund_code</th>\n",
-       "      <th>pec_code</th>\n",
-       "      <th>appropriation_category_code</th>\n",
-       "      <th>curr_bud_am</th>\n",
-       "      <th>cash_exp_am</th>\n",
-       "      <th>pect_task_code</th>\n",
        "      <th>project_id</th>\n",
-       "      <th>pec</th>\n",
-       "      <th>pect</th>\n",
-       "      <th>pect_description</th>\n",
-       "      <th>program</th>\n",
+       "      <th>grant_program</th>\n",
+       "      <th>state_fiscal_awarded_year</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>1639</th>\n",
-       "      <td>0315000005</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030720</td>\n",
-       "      <td>2223</td>\n",
-       "      <td>1512000.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>100</td>\n",
+       "      <th>2442</th>\n",
        "      <td>5475(038)</td>\n",
-       "      <td>2030720</td>\n",
-       "      <td>100.00</td>\n",
-       "      <td>Active Transportation Program (ATP)</td>\n",
        "      <td>Active Transportation Program (ATP)</td>\n",
+       "      <td>2223</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1640</th>\n",
-       "      <td>0315000005</td>\n",
-       "      <td>0042</td>\n",
-       "      <td>2030210</td>\n",
-       "      <td>2122</td>\n",
-       "      <td>2860000.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>350</td>\n",
-       "      <td>5475(038)</td>\n",
-       "      <td>2030210</td>\n",
-       "      <td>350.00</td>\n",
-       "      <td>Solutions for Congested Corridors Program (SCCP)</td>\n",
-       "      <td>Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1641</th>\n",
-       "      <td>0315000005</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1314</td>\n",
-       "      <td>1061999.97</td>\n",
-       "      <td>1061999.97</td>\n",
-       "      <td>820</td>\n",
-       "      <td>5475(038)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
-       "      <td>Local Assistance</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1642</th>\n",
-       "      <td>0315000005</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>1516</td>\n",
-       "      <td>2898000.00</td>\n",
-       "      <td>2898000.00</td>\n",
-       "      <td>820</td>\n",
+       "      <th>2443</th>\n",
        "      <td>5475(038)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>820.00</td>\n",
-       "      <td>Congestion Mitigation &amp; Air Quality Improvement Program (CMAQ)</td>\n",
        "      <td>Local Assistance</td>\n",
+       "      <td>2223</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>1643</th>\n",
-       "      <td>0315000005</td>\n",
-       "      <td>0890</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>2223</td>\n",
-       "      <td>9552155.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>810</td>\n",
+       "      <th>2444</th>\n",
        "      <td>5475(038)</td>\n",
-       "      <td>2030010</td>\n",
-       "      <td>810.00</td>\n",
-       "      <td>Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)</td>\n",
-       "      <td>Local Assistance</td>\n",
+       "      <td>Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017</td>\n",
+       "      <td>2122</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "     adv_project_id fund_code pec_code appropriation_category_code  \\\n",
-       "1639     0315000005      0890  2030720                        2223   \n",
-       "1640     0315000005      0042  2030210                        2122   \n",
-       "1641     0315000005      0890  2030010                        1314   \n",
-       "1642     0315000005      0890  2030010                        1516   \n",
-       "1643     0315000005      0890  2030010                        2223   \n",
-       "\n",
-       "      curr_bud_am  cash_exp_am  pect_task_code project_id      pec   pect  \\\n",
-       "1639   1512000.00         0.00             100  5475(038)  2030720 100.00   \n",
-       "1640   2860000.00         0.00             350  5475(038)  2030210 350.00   \n",
-       "1641   1061999.97   1061999.97             820  5475(038)  2030010 820.00   \n",
-       "1642   2898000.00   2898000.00             820  5475(038)  2030010 820.00   \n",
-       "1643   9552155.00         0.00             810  5475(038)  2030010 810.00   \n",
+       "     project_id  \\\n",
+       "2442  5475(038)   \n",
+       "2443  5475(038)   \n",
+       "2444  5475(038)   \n",
        "\n",
-       "                                                                                           pect_description  \\\n",
-       "1639                                                                    Active Transportation Program (ATP)   \n",
-       "1640                                                       Solutions for Congested Corridors Program (SCCP)   \n",
-       "1641                                         Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1642                                         Congestion Mitigation & Air Quality Improvement Program (CMAQ)   \n",
-       "1643  Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)   \n",
+       "                                                                                                                                                  grant_program  \\\n",
+       "2442                                                                                                                        Active Transportation Program (ATP)   \n",
+       "2443                                                                                                                                           Local Assistance   \n",
+       "2444  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017   \n",
        "\n",
-       "                                                                                                                                                        program  \n",
-       "1639                                                                                                                        Active Transportation Program (ATP)  \n",
-       "1640  Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017  \n",
-       "1641                                                                                                                                           Local Assistance  \n",
-       "1642                                                                                                                                           Local Assistance  \n",
-       "1643                                                                                                                                           Local Assistance  "
+       "     state_fiscal_awarded_year  \n",
+       "2442                      2223  \n",
+       "2443                      2223  \n",
+       "2444                      2122  "
       ]
      },
-     "execution_count": 276,
+     "execution_count": 135,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
+   "source": [
+    "awards_df.loc[awards_df.project_id == \"5475(038)\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 136,
+   "id": "42649961",
+   "metadata": {},
+   "outputs": [],
    "source": [
     "# Check original df \n",
-    "pect_df.loc[pect_df.project_id ==  \"5475(038)\"]"
+    "# pect_df.loc[pect_df.project_id ==  \"5475(038)\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 277,
-   "id": "e6406250",
+   "execution_count": 137,
+   "id": "ebc41d72",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/html": [
        "<style  type=\"text/css\" >\n",
-       "#T_29512_row0_col20,#T_29512_row0_col24,#T_29512_row0_col47,#T_29512_row0_col53{\n",
+       "#T_b81e5_row0_col20,#T_b81e5_row0_col24,#T_b81e5_row0_col47,#T_b81e5_row0_col53{\n",
        "            color:  red;\n",
-       "        }</style><table id=\"T_29512_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >comment_desc</th>        <th class=\"col_heading level0 col2\" >district_code</th>        <th class=\"col_heading level0 col3\" >est_total_prj_costs</th>        <th class=\"col_heading level0 col4\" >location_name</th>        <th class=\"col_heading level0 col5\" >project_label_name</th>        <th class=\"col_heading level0 col6\" >original_post_mile_begin_id</th>        <th class=\"col_heading level0 col7\" >original_post_mile_end_id</th>        <th class=\"col_heading level0 col8\" >revised_post_mile_begin_ind</th>        <th class=\"col_heading level0 col9\" >revised_post_mile_end_ind</th>        <th class=\"col_heading level0 col10\" >route_name</th>        <th class=\"col_heading level0 col11\" >state_hwy_ind</th>        <th class=\"col_heading level0 col12\" >senate_district_code</th>        <th class=\"col_heading level0 col13\" >update_date_time</th>        <th class=\"col_heading level0 col14\" >agency_name</th>        <th class=\"col_heading level0 col15\" >urban_area_code</th>        <th class=\"col_heading level0 col16\" >county_name</th>        <th class=\"col_heading level0 col17\" >work_type_desc</th>        <th class=\"col_heading level0 col18\" >category_desc</th>        <th class=\"col_heading level0 col19\" >current_phase</th>        <th class=\"col_heading level0 col20\" >active_transportation_program_(atp)</th>        <th class=\"col_heading level0 col21\" >bridge_inspection_&_scour_evaluation</th>        <th class=\"col_heading level0 col22\" >covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation</th>        <th class=\"col_heading level0 col23\" >carbon_reduction_program_(crp)</th>        <th class=\"col_heading level0 col24\" >congestion_mitigation_&_air_quality_improvement_program_(cmaq)</th>        
<th class=\"col_heading level0 col25\" >coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds</th>        <th class=\"col_heading level0 col26\" >corridor_mobility_improvement_account_(cmia)_program</th>        <th class=\"col_heading level0 col27\" >county_exchange_funds</th>        <th class=\"col_heading level0 col28\" >county_state_match_program</th>        <th class=\"col_heading level0 col29\" >earmarks_projects_(hpp,_demo_cpfcds,_etc.)</th>        <th class=\"col_heading level0 col30\" >emergency_relief_(er)</th>        <th class=\"col_heading level0 col31\" >ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program</th>        <th class=\"col_heading level0 col32\" >funds_for_planning,_programming_and_monitoring_-_rip</th>        <th class=\"col_heading level0 col33\" >general_funded_designated_programs</th>        <th class=\"col_heading level0 col34\" >hazard_elimination_safety_(hes)</th>        <th class=\"col_heading level0 col35\" >high_risk_rural_roads_program_(hr3)</th>        <th class=\"col_heading level0 col36\" >highway_bridge_</th>        <th class=\"col_heading level0 col37\" >highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund</th>        <th class=\"col_heading level0 col38\" >highway_safety_improvement_program_(hsip)_(non-infrastructure)</th>        <th class=\"col_heading level0 col39\" >highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund</th>        <th class=\"col_heading level0 col40\" >local_partnership_program_(lpp_–_competitive)_</th>        <th class=\"col_heading level0 col41\" >local_roads</th>        <th class=\"col_heading level0 col42\" >local_roads_rehabilitation</th>        <th class=\"col_heading level0 col43\" >railroad_grade_crossing_protection</th>        <th class=\"col_heading level0 col44\" >railroad_grade_separations</th>        <th class=\"col_heading level0 col45\" 
>rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_</th>        <th class=\"col_heading level0 col46\" >regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system)</th>        <th class=\"col_heading level0 col47\" >regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip)</th>        <th class=\"col_heading level0 col48\" >regional_transportation_planning_agency_(rtpa)_stp_match_exchange</th>        <th class=\"col_heading level0 col49\" >sb1_funded_freeway_service_patrol</th>        <th class=\"col_heading level0 col50\" >shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds</th>        <th class=\"col_heading level0 col51\" >safe_routes_to_school_(sr2s_and_srts)</th>        <th class=\"col_heading level0 col52\" >set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act</th>        <th class=\"col_heading level0 col53\" >solutions_for_congested_corridors_program_(sccp)</th>        <th class=\"col_heading level0 col54\" >special_programs</th>        <th class=\"col_heading level0 col55\" >state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic)</th>        <th class=\"col_heading level0 col56\" >structures_seismic_retrofit_</th>        <th class=\"col_heading level0 col57\" >trade_corridor_enhancement_account_(tcea)_programs_–_local_share</th>        <th class=\"col_heading level0 col58\" >trade_corridor_enhancement_account_(tcea)_programs_–_state_share</th>        <th class=\"col_heading level0 col59\" >trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads</th>        <th class=\"col_heading level0 col60\" >traffic_congestion_relief_program_(_tcrp_)</th>        <th class=\"col_heading level0 col61\" >unknown</th>    </tr></thead><tbody>\n",
+       "        }</style><table id=\"T_b81e5_\" ><thead>    <tr>        <th class=\"blank level0\" ></th>        <th class=\"col_heading level0 col0\" >project_id</th>        <th class=\"col_heading level0 col1\" >comment_desc</th>        <th class=\"col_heading level0 col2\" >district_code</th>        <th class=\"col_heading level0 col3\" >est_total_prj_costs</th>        <th class=\"col_heading level0 col4\" >location_name</th>        <th class=\"col_heading level0 col5\" >project_label_name</th>        <th class=\"col_heading level0 col6\" >original_post_mile_begin_id</th>        <th class=\"col_heading level0 col7\" >original_post_mile_end_id</th>        <th class=\"col_heading level0 col8\" >revised_post_mile_begin_ind</th>        <th class=\"col_heading level0 col9\" >revised_post_mile_end_ind</th>        <th class=\"col_heading level0 col10\" >route_name</th>        <th class=\"col_heading level0 col11\" >state_hwy_ind</th>        <th class=\"col_heading level0 col12\" >senate_district_code</th>        <th class=\"col_heading level0 col13\" >update_date_time</th>        <th class=\"col_heading level0 col14\" >agency_name</th>        <th class=\"col_heading level0 col15\" >urban_area_code</th>        <th class=\"col_heading level0 col16\" >county_name</th>        <th class=\"col_heading level0 col17\" >work_type_desc</th>        <th class=\"col_heading level0 col18\" >category_desc</th>        <th class=\"col_heading level0 col19\" >current_phase</th>        <th class=\"col_heading level0 col20\" >active_transportation_program_(atp)</th>        <th class=\"col_heading level0 col21\" >bridge_inspection_&_scour_evaluation</th>        <th class=\"col_heading level0 col22\" >covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation</th>        <th class=\"col_heading level0 col23\" >carbon_reduction_program_(crp)</th>        <th class=\"col_heading level0 col24\" >congestion_mitigation_&_air_quality_improvement_program_(cmaq)</th>        
<th class=\"col_heading level0 col25\" >coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds</th>        <th class=\"col_heading level0 col26\" >corridor_mobility_improvement_account_(cmia)_program</th>        <th class=\"col_heading level0 col27\" >county_exchange_funds</th>        <th class=\"col_heading level0 col28\" >county_state_match_program</th>        <th class=\"col_heading level0 col29\" >earmarks_projects_(hpp,_demo_cpfcds,_etc.)</th>        <th class=\"col_heading level0 col30\" >emergency_relief_(er)</th>        <th class=\"col_heading level0 col31\" >ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program</th>        <th class=\"col_heading level0 col32\" >funds_for_planning,_programming_and_monitoring_-_rip</th>        <th class=\"col_heading level0 col33\" >general_funded_designated_programs</th>        <th class=\"col_heading level0 col34\" >hazard_elimination_safety_(hes)</th>        <th class=\"col_heading level0 col35\" >high_risk_rural_roads_program_(hr3)</th>        <th class=\"col_heading level0 col36\" >highway_bridge_</th>        <th class=\"col_heading level0 col37\" >highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund</th>        <th class=\"col_heading level0 col38\" >highway_safety_improvement_program_(hsip)_(non-infrastructure)</th>        <th class=\"col_heading level0 col39\" >highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund</th>        <th class=\"col_heading level0 col40\" >local_partnership_program_(lpp_–_competitive)_</th>        <th class=\"col_heading level0 col41\" >local_roads</th>        <th class=\"col_heading level0 col42\" >local_roads_rehabilitation</th>        <th class=\"col_heading level0 col43\" >railroad_grade_crossing_protection</th>        <th class=\"col_heading level0 col44\" >railroad_grade_separations</th>        <th class=\"col_heading level0 col45\" 
>rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_</th>        <th class=\"col_heading level0 col46\" >regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system)</th>        <th class=\"col_heading level0 col47\" >regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip)</th>        <th class=\"col_heading level0 col48\" >regional_transportation_planning_agency_(rtpa)_stp_match_exchange</th>        <th class=\"col_heading level0 col49\" >sb1_funded_freeway_service_patrol</th>        <th class=\"col_heading level0 col50\" >shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds</th>        <th class=\"col_heading level0 col51\" >safe_routes_to_school_(sr2s_and_srts)</th>        <th class=\"col_heading level0 col52\" >set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act</th>        <th class=\"col_heading level0 col53\" >solutions_for_congested_corridors_program_(sccp)</th>        <th class=\"col_heading level0 col54\" >special_programs</th>        <th class=\"col_heading level0 col55\" >state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic)</th>        <th class=\"col_heading level0 col56\" >structures_seismic_retrofit_</th>        <th class=\"col_heading level0 col57\" >trade_corridor_enhancement_account_(tcea)_programs_–_local_share</th>        <th class=\"col_heading level0 col58\" >trade_corridor_enhancement_account_(tcea)_programs_–_state_share</th>        <th class=\"col_heading level0 col59\" >trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads</th>        <th class=\"col_heading level0 col60\" >traffic_congestion_relief_program_(_tcrp_)</th>        <th class=\"col_heading level0 col61\" >unknown</th>    </tr></thead><tbody>\n",
        "                <tr>\n",
-       "                        <th id=\"T_29512_level0_row0\" class=\"row_heading level0 row0\" >2664</th>\n",
-       "                        <td id=\"T_29512_row0_col0\" class=\"data row0 col0\" >5475(038)</td>\n",
-       "                        <td id=\"T_29512_row0_col1\" class=\"data row0 col1\" >10/1/2020:   Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022.   There will be a gap of time that is not reimbursable.  CR\n",
+       "                        <th id=\"T_b81e5_level0_row0\" class=\"row_heading level0 row0\" >2418</th>\n",
+       "                        <td id=\"T_b81e5_row0_col0\" class=\"data row0 col0\" >5475(038)</td>\n",
+       "                        <td id=\"T_b81e5_row0_col1\" class=\"data row0 col1\" >10/1/2020:   Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022.   There will be a gap of time that is not reimbursable.  CR\n",
        "\n",
        "CMAQ Emissions Benefit:  .03 ROG, .02 NOx, .01 PM10\n",
        "Project has EPSP approval for $2,646,524 of CMAQ for R/W to 15/16 FY.   And EPSP for $291,476 of CMAQ for PE to 15/16 FY.\n",
        "</td>\n",
-       "                        <td id=\"T_29512_row0_col2\" class=\"data row0 col2\" >03</td>\n",
-       "                        <td id=\"T_29512_row0_col3\" class=\"data row0 col3\" >36291000.000000</td>\n",
-       "                        <td id=\"T_29512_row0_col4\" class=\"data row0 col4\" >Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.</td>\n",
-       "                        <td id=\"T_29512_row0_col5\" class=\"data row0 col5\" >Pedestrian and Bike Path</td>\n",
-       "                        <td id=\"T_29512_row0_col6\" class=\"data row0 col6\" >nan</td>\n",
-       "                        <td id=\"T_29512_row0_col7\" class=\"data row0 col7\" >nan</td>\n",
-       "                        <td id=\"T_29512_row0_col8\" class=\"data row0 col8\" >None</td>\n",
-       "                        <td id=\"T_29512_row0_col9\" class=\"data row0 col9\" >None</td>\n",
-       "                        <td id=\"T_29512_row0_col10\" class=\"data row0 col10\" >0-CHts</td>\n",
-       "                        <td id=\"T_29512_row0_col11\" class=\"data row0 col11\" >N</td>\n",
-       "                        <td id=\"T_29512_row0_col12\" class=\"data row0 col12\" >None</td>\n",
-       "                        <td id=\"T_29512_row0_col13\" class=\"data row0 col13\" >2023-10-13 13:34:21</td>\n",
-       "                        <td id=\"T_29512_row0_col14\" class=\"data row0 col14\" >Citrus Heights</td>\n",
-       "                        <td id=\"T_29512_row0_col15\" class=\"data row0 col15\" >3067</td>\n",
-       "                        <td id=\"T_29512_row0_col16\" class=\"data row0 col16\" >Sacramento County</td>\n",
-       "                        <td id=\"T_29512_row0_col17\" class=\"data row0 col17\" >None</td>\n",
-       "                        <td id=\"T_29512_row0_col18\" class=\"data row0 col18\" >Pedestrian and Bike Path</td>\n",
-       "                        <td id=\"T_29512_row0_col19\" class=\"data row0 col19\" >single phase</td>\n",
-       "                        <td id=\"T_29512_row0_col20\" class=\"data row0 col20\" >Yes</td>\n",
-       "                        <td id=\"T_29512_row0_col21\" class=\"data row0 col21\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col22\" class=\"data row0 col22\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col23\" class=\"data row0 col23\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col24\" class=\"data row0 col24\" >Yes</td>\n",
-       "                        <td id=\"T_29512_row0_col25\" class=\"data row0 col25\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col26\" class=\"data row0 col26\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col27\" class=\"data row0 col27\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col28\" class=\"data row0 col28\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col29\" class=\"data row0 col29\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col30\" class=\"data row0 col30\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col31\" class=\"data row0 col31\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col32\" class=\"data row0 col32\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col33\" class=\"data row0 col33\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col34\" class=\"data row0 col34\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col35\" class=\"data row0 col35\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col36\" class=\"data row0 col36\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col37\" class=\"data row0 col37\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col38\" class=\"data row0 col38\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col39\" class=\"data row0 col39\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col40\" class=\"data row0 col40\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col41\" class=\"data row0 col41\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col42\" class=\"data row0 col42\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col43\" class=\"data row0 col43\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col44\" class=\"data row0 col44\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col45\" class=\"data row0 col45\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col46\" class=\"data row0 col46\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col47\" class=\"data row0 col47\" >Yes</td>\n",
-       "                        <td id=\"T_29512_row0_col48\" class=\"data row0 col48\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col49\" class=\"data row0 col49\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col50\" class=\"data row0 col50\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col51\" class=\"data row0 col51\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col52\" class=\"data row0 col52\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col53\" class=\"data row0 col53\" >Yes</td>\n",
-       "                        <td id=\"T_29512_row0_col54\" class=\"data row0 col54\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col55\" class=\"data row0 col55\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col56\" class=\"data row0 col56\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col57\" class=\"data row0 col57\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col58\" class=\"data row0 col58\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col59\" class=\"data row0 col59\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col60\" class=\"data row0 col60\" >No</td>\n",
-       "                        <td id=\"T_29512_row0_col61\" class=\"data row0 col61\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col2\" class=\"data row0 col2\" >03</td>\n",
+       "                        <td id=\"T_b81e5_row0_col3\" class=\"data row0 col3\" >36291000.000000</td>\n",
+       "                        <td id=\"T_b81e5_row0_col4\" class=\"data row0 col4\" >Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.</td>\n",
+       "                        <td id=\"T_b81e5_row0_col5\" class=\"data row0 col5\" >Pedestrian and Bike Path</td>\n",
+       "                        <td id=\"T_b81e5_row0_col6\" class=\"data row0 col6\" >nan</td>\n",
+       "                        <td id=\"T_b81e5_row0_col7\" class=\"data row0 col7\" >nan</td>\n",
+       "                        <td id=\"T_b81e5_row0_col8\" class=\"data row0 col8\" >None</td>\n",
+       "                        <td id=\"T_b81e5_row0_col9\" class=\"data row0 col9\" >None</td>\n",
+       "                        <td id=\"T_b81e5_row0_col10\" class=\"data row0 col10\" >0-CHts</td>\n",
+       "                        <td id=\"T_b81e5_row0_col11\" class=\"data row0 col11\" >N</td>\n",
+       "                        <td id=\"T_b81e5_row0_col12\" class=\"data row0 col12\" >None</td>\n",
+       "                        <td id=\"T_b81e5_row0_col13\" class=\"data row0 col13\" >2023-10-13 13:34:21</td>\n",
+       "                        <td id=\"T_b81e5_row0_col14\" class=\"data row0 col14\" >Citrus Heights</td>\n",
+       "                        <td id=\"T_b81e5_row0_col15\" class=\"data row0 col15\" >3067</td>\n",
+       "                        <td id=\"T_b81e5_row0_col16\" class=\"data row0 col16\" >Sacramento County</td>\n",
+       "                        <td id=\"T_b81e5_row0_col17\" class=\"data row0 col17\" >None</td>\n",
+       "                        <td id=\"T_b81e5_row0_col18\" class=\"data row0 col18\" >Pedestrian and Bike Path</td>\n",
+       "                        <td id=\"T_b81e5_row0_col19\" class=\"data row0 col19\" >single phase</td>\n",
+       "                        <td id=\"T_b81e5_row0_col20\" class=\"data row0 col20\" >Yes</td>\n",
+       "                        <td id=\"T_b81e5_row0_col21\" class=\"data row0 col21\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col22\" class=\"data row0 col22\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col23\" class=\"data row0 col23\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col24\" class=\"data row0 col24\" >Yes</td>\n",
+       "                        <td id=\"T_b81e5_row0_col25\" class=\"data row0 col25\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col26\" class=\"data row0 col26\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col27\" class=\"data row0 col27\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col28\" class=\"data row0 col28\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col29\" class=\"data row0 col29\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col30\" class=\"data row0 col30\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col31\" class=\"data row0 col31\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col32\" class=\"data row0 col32\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col33\" class=\"data row0 col33\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col34\" class=\"data row0 col34\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col35\" class=\"data row0 col35\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col36\" class=\"data row0 col36\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col37\" class=\"data row0 col37\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col38\" class=\"data row0 col38\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col39\" class=\"data row0 col39\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col40\" class=\"data row0 col40\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col41\" class=\"data row0 col41\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col42\" class=\"data row0 col42\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col43\" class=\"data row0 col43\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col44\" class=\"data row0 col44\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col45\" class=\"data row0 col45\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col46\" class=\"data row0 col46\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col47\" class=\"data row0 col47\" >Yes</td>\n",
+       "                        <td id=\"T_b81e5_row0_col48\" class=\"data row0 col48\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col49\" class=\"data row0 col49\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col50\" class=\"data row0 col50\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col51\" class=\"data row0 col51\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col52\" class=\"data row0 col52\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col53\" class=\"data row0 col53\" >Yes</td>\n",
+       "                        <td id=\"T_b81e5_row0_col54\" class=\"data row0 col54\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col55\" class=\"data row0 col55\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col56\" class=\"data row0 col56\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col57\" class=\"data row0 col57\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col58\" class=\"data row0 col58\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col59\" class=\"data row0 col59\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col60\" class=\"data row0 col60\" >No</td>\n",
+       "                        <td id=\"T_b81e5_row0_col61\" class=\"data row0 col61\" >No</td>\n",
        "            </tr>\n",
        "    </tbody></table>"
       ],
       "text/plain": [
-       "<pandas.io.formats.style.Styler at 0x25bda4194c8>"
+       "<pandas.io.formats.style.Styler at 0x1d6d8f99288>"
       ]
      },
-     "execution_count": 277,
+     "execution_count": 137,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -4563,7 +3693,7 @@
   },
   {
    "cell_type": "markdown",
-   "id": "643a54b1",
+   "id": "6f411cf3",
    "metadata": {},
    "source": [
     "## Save to Excel/Final Touches"
@@ -4571,8 +3701,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 278,
-   "id": "ccc8fee4",
+   "execution_count": 138,
+   "id": "fa95350c",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4583,17 +3713,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 279,
-   "id": "12096bef",
+   "execution_count": 139,
+   "id": "d3b61876",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(11768, 62)"
+       "(11272, 62)"
       ]
      },
-     "execution_count": 279,
+     "execution_count": 139,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -4604,17 +3734,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 280,
-   "id": "f9e11b2a",
+   "execution_count": 140,
+   "id": "4b0c3eb1",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "11768"
+       "11272"
       ]
      },
-     "execution_count": 280,
+     "execution_count": 140,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -4625,8 +3755,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 281,
-   "id": "d1896385",
+   "execution_count": 141,
+   "id": "63073ff2",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4636,8 +3766,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 282,
-   "id": "4a61a680",
+   "execution_count": 142,
+   "id": "eded55f3",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4647,8 +3777,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 283,
-   "id": "99035caf",
+   "execution_count": 143,
+   "id": "b65fdd45",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -4657,18 +3787,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 284,
-   "id": "96b26252",
+   "execution_count": 147,
+   "id": "ebc5cc10",
    "metadata": {},
    "outputs": [],
    "source": [
     "\n",
-    "with pd.ExcelWriter(\"./LP2000.xlsx\") as writer:\n",
+    "with pd.ExcelWriter(\"LP2000_projects.xlsx\") as writer:\n",
     "    project_df.to_excel(writer, sheet_name=\"project\", index=False)\n",
     "    county_df.to_excel(writer, sheet_name=\"county\", index=False)\n",
     "    district_df.to_excel(writer, sheet_name=\"district\", index=False)\n",
     "    awards_df.to_excel(writer, sheet_name=\"awards\", index=False)\n",
-    "    final_fund_phase_df.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n"
+    "    final_fund_phase_df2.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n"
    ]
   }
  ],
diff --git a/project_list/archive_compile_all_projects.ipynb b/project_list/archive_compile_all_projects.ipynb
new file mode 100644
index 000000000..1863ec094
--- /dev/null
+++ b/project_list/archive_compile_all_projects.ipynb
@@ -0,0 +1,2642 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "77106c12-82aa-4be4-8d9c-e66fafec4d67",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "## General function to clean up data from various grants\n",
+    "To-Do\n",
+    "* De-duplicate projects\n",
+    "* Rearrange counties in County column in alphabetical order.\n",
+    "* Millions to thousands -> seems easier to read.\n",
+    "* Differentiate between project START year and END year.\n",
+    "* Add Post Mile column\n",
+    "\n",
+    "Done\n",
+    "* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n",
+    "\n",
+    "Strategy/Questions:\n",
+    "* Make sure one row=one project. How? \n",
+    "* What should be the unit of project cost?\n",
+    "* Break up Caltrans by district or leave as is? \n",
+    "\n",
+    "Columns/Data Dictionary\n",
+    "* project_title (str): N/A.\n",
+    "* lead_agency (str): the entity leading the project or receiving the grant.\n",
+    "* project_year (TBD): when the project will begin.\n",
+    "* project_category (str): the category/categories a project belongs to.\n",
+    "* grant_program (str): the program a project is receiving funds from. This does not preclude a project from receiving funds from multiple programs.\n",
+    "* phase (str): the latest phase the project is in.\n",
+    "* project_description (str): N/A.\n",
+    "* total_project_cost_(millions): N/A.\n",
+    "* total_available_funds_(millions): all the funds available to the project.\n",
+    "* unfunded_needs_(millions): total_project_cost_(millions) minus total_available_funds_(millions).\n",
+    "* city (str): the city a project is located in.\n",
+    "* county (str): the county a project is located in.\n",
+    "* location (str): an address or more detailed information regarding where the project will take place.\n",
+    "* geometry: geospatial information.\n",
+    "* data_source (str): N/A.\n",
+    "* notes (str): additional information regarding the project.\n",
+    "* funding_notes (str): additional funding information regarding the project.\n",
+    "* ct_district (int): the Caltrans district a project is located in.\n",
+    "* fully_funded (str): comparing total_available_funds_(millions) and total_project_cost_(millions) to figure out whether a project is fully, partially, or not funded.\n",
+    "* enough_info (str): counting the # of null values and the # of words in the project description to determine whether or not a project has enough information."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "06ac19fe-7b6c-4560-9740-8a4f72c5b6e1",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n",
+      "  warnings.warn(\n",
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n",
+      "\n",
+      "import os\n",
+      "os.environ['USE_PYGEOS'] = '0'\n",
+      "import geopandas\n",
+      "\n",
+      "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n",
+      "  import geopandas as gpd\n"
+     ]
+    }
+   ],
+   "source": [
+    "import _cleaning_utils\n",
+    "import _harmonization_utils as harmonization_utils\n",
+    "import _state_rail_plan_utils as srp_utils\n",
+    "# import geopandas as gpd\n",
+    "import pandas as pd\n",
+    "# import shapely\n",
+    "from calitp_data_analysis.sql import to_snakecase"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "d78be4e7-2349-4ffd-9d59-f9fa450ae7dd",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'\\nimport re\\nimport nltk\\nfrom nltk import ngrams\\nfrom nltk.corpus import stopwords\\nfrom nltk.tokenize import sent_tokenize, word_tokenize\\nimport re\\nfrom collections import Counter\\nfrom autocorrect import Speller\\n'"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\"\"\"\n",
+    "import re\n",
+    "import nltk\n",
+    "from nltk import ngrams\n",
+    "from nltk.corpus import stopwords\n",
+    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
+    "import re\n",
+    "from collections import Counter\n",
+    "from autocorrect import Speller\n",
+    "\"\"\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "e7b68eeb-422d-4be8-b557-7bd9e95599af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pd.options.display.max_columns = 100\n",
+    "pd.options.display.float_format = \"{:.2f}\".format\n",
+    "pd.set_option(\"display.max_rows\", None)\n",
+    "pd.set_option(\"display.max_colwidth\", None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "0541b671-a020-485f-9b0a-f46238f1d4f9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# lost = harmonization_utils.load_lost()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "db720477-44f5-4cbd-80ac-a0fe86e47cf9",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def create_notes(df, note_cols: list, new_col_name: str):\n",
+    "    \"\"\"\n",
+    "    Concat multiple columns into one.\n",
+    "    \"\"\"\n",
+    "    prefix = \"_\"\n",
+    "    for column in note_cols:\n",
+    "        df[f\"{prefix}{column}\"] = df[column].astype(str)\n",
+    "    note_cols = [prefix + sub for sub in note_cols]\n",
+    "\n",
+    "    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values\n",
+    "    def combine_notes(x):\n",
+    "        return \", \".join([col + \": \" + x[col] for col in note_cols])\n",
+    "\n",
+    "    df[new_col_name] = df.apply(combine_notes, axis=1)\n",
+    "    df[new_col_name] = df[new_col_name].str.replace(\"_\", \" \")\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "78b5d13c-f4ba-4baf-8c3a-f520a960a44a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp = harmonization_utils.load_state_rail_plan()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "f3829bd6-8fc1-4c15-809f-75020248a722",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "columns_to_keep = [\n",
+    "    \"project_title\",\n",
+    "    \"lead_agency\",\n",
+    "    \"project_year\",\n",
+    "    \"project_category\",\n",
+    "    \"project_start_year\",\n",
+    "    \"project_completion_year\",\n",
+    "    \"grant_program\",\n",
+    "    \"phase\",\n",
+    "    \"project_description\",\n",
+    "    \"total_project_cost_(millions)\",\n",
+    "    \"total_available_funds_(millions)\",\n",
+    "    \"unfunded_needs_(millions)\",\n",
+    "    \"city\",\n",
+    "    \"county\",\n",
+    "    \"location\",\n",
+    "    \"post_mile\",\n",
+    "    \"geometry\",\n",
+    "    \"data_source\",\n",
+    "    \"notes\",\n",
+    "    \"funding_notes\",\n",
+    "    \"ct_district\",\n",
+    "    \"project_description2\",\n",
+    "]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "291e821d-9b3f-40a2-bde9-7a12b31eb410",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def harmonizing(\n",
+    "    df,\n",
+    "    agency_name_col: str,\n",
+    "    project_name_col: str,\n",
+    "    project_description_col: str,\n",
+    "    project_category_col: str,\n",
+    "    phase_col: str,\n",
+    "    project_cost_col: str,\n",
+    "    location_col: str,\n",
+    "    geography_col: str,\n",
+    "    post_mile_col:str,\n",
+    "    county_col: str,\n",
+    "    city_col: str,\n",
+    "    district_col:str, \n",
+    "    project_start_year_col: str,\n",
+    "    project_completion_year_col:str,\n",
+    "    program_col: str,\n",
+    "    data_source: str,\n",
+    "    fund_cols: list,\n",
+    "    notes_cols: list,\n",
+    "    cost_in_millions: bool = True,\n",
+    "):\n",
+    "    \"\"\"\n",
+    "    Take a dataset and change the column names/types to\n",
+    "    default names and formats.\n",
+    "    \"\"\"\n",
+    "    rename_columns = {\n",
+    "        agency_name_col: \"lead_agency\",\n",
+    "        project_name_col: \"project_title\",\n",
+    "        project_description_col: \"project_description\",\n",
+    "        project_category_col: \"project_category\",\n",
+    "        project_cost_col: \"total_project_cost_(millions)\",\n",
+    "        location_col: \"location\",\n",
+    "        geography_col: \"geometry\",\n",
+    "        phase_col: \"phase\",\n",
+    "        post_mile_col: \"post_mile\",\n",
+    "        county_col: \"county\",\n",
+    "        city_col: \"city\",\n",
+    "        district_col: \"ct_district\",\n",
+    "        project_start_year_col: \"project_start_year\",\n",
+    "        project_end_year_col: \"project_completion_year\",\n",
+    "        program_col: \"grant_program\",\n",
+    "    }\n",
+    "    # Rename columns\n",
+    "    df = df.rename(columns=rename_columns)\n",
+    "    \n",
+    "    # Clean up monetary columns to be numeric (non-numeric values become 0)\n",
+    "    cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+    "    for i in cost_columns:\n",
+    "        df[i] = df[i].apply(pd.to_numeric, errors=\"coerce\").fillna(0)\n",
+    "    \n",
+    "    # Divide cost columns by millions\n",
+    "    # If bool is set to True\n",
+    "    if cost_in_millions:\n",
+    "        for i in fund_cols + [\"total_project_cost_(millions)\"]:\n",
+    "            df[i] = df[i].divide(1_000_000)\n",
+    "\n",
+    "    # Add new column with funding breakout\n",
+    "    # Since it's summarized above and the details are suppressed.\n",
+    "    df[\"total_available_funds_(millions)\"] = df[fund_cols].sum(axis=1)\n",
+    "    df = create_notes(df, fund_cols, \"funding_notes\")\n",
+    "    \n",
+    "    # Add column for unfunded needs\n",
+    "    df[\"unfunded_needs_(millions)\"] = df[\"total_project_cost_(millions)\"] - df[\"total_available_funds_(millions)\"]\n",
+    "    \n",
+    "    # Add program\n",
+    "    df[\"data_source\"] = data_source\n",
+    "    \n",
+    "    # Create columns even if they don't exist, just to harmonize\n",
+    "    # before concatting.\n",
+    "    create_columns = [\n",
+    "        \"county\",\n",
+    "        \"city\",\n",
+    "        \"notes\",\n",
+    "        \"project_start_year\",\n",
+    "        \"project_completion_year\",\n",
+    "        \"post_mile\",\n",
+    "        \"project_category\",\n",
+    "        \"location\",\n",
+    "        \"phase\",\n",
+    "        \"ct_district\"\n",
+    "    ]\n",
+    "    for column in create_columns:\n",
+    "        if column not in df:\n",
+    "            df[column] = \"None\"\n",
+    "    if \"geometry\" not in df:\n",
+    "        df[\"geometry\"] = None\n",
+    "    if \"grant_program\" not in df:\n",
+    "        df[\"grant_program\"] = data_source\n",
+    "    \n",
+    "    # Create notes - aka other columns that were suppressed\n",
+    "    df = create_notes(df, notes_cols, \"notes\")\n",
+    "    \n",
+    "    # Clean up string columns\n",
+    "    string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n",
+    "    for i in string_cols:\n",
+    "        df[i] = df[i].str.replace(\"_\", \" \").str.strip().str.title()\n",
+    "\n",
+    "    # Fill in any nulls\n",
+    "    df['project_description2'] = df.project_description.fillna(df.project_title)\n",
+    "    df = df.fillna(df.dtypes.replace({\"float64\": 0.0, \"object\": \"None\"}))\n",
+    "\n",
+    "    # Only keep certain columns\n",
+    "    df = df[columns_to_keep]\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "0ea5badb-841b-4941-b48f-23d750b5ed27",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def harmonize_srp():\n",
+    "    df = harmonization_utils.load_state_rail_plan()\n",
+    "    df = harmonizing(\n",
+    "        df,\n",
+    "        agency_name_col=\"lead_agency\",\n",
+    "        project_name_col=\"project_name\",\n",
+    "        project_description_col=\"project_description\",\n",
+    "        project_category_col=\"project_category\",\n",
+    "        phase_col=\"\",\n",
+    "        project_cost_col=\"total_project_cost\",\n",
+    "        location_col=\"corridor\",\n",
+    "        geography_col=\"\",\n",
+    "        county_col=\"\",\n",
+    "        city_col=\"\",\n",
+    "        district_col=\"\",\n",
+    "        project_year_col=\"\",\n",
+    "        program_col=\"\",\n",
+    "        data_source=\"State Rail Plan\",\n",
+    "        fund_cols=[],\n",
+    "        notes_cols = ['project_time_horizon','srp_region', \n",
+    "       'sub_corridor_node_1', 'sub_corridor_node_2', 'itsp_corridor'],\n",
+    "        cost_in_millions=True,\n",
+    "    )\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "2b60a4e7-cc69-41fb-9285-c32f9fa0791e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp_harmonized = harmonize_srp()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "2100f3b7-55c8-45ad-b3d1-99a0319c7ac8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp_harmonized.tail()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "3142aacb-d5f4-4bc1-8cc8-99f50c45b301",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp_og = harmonization_utils.load_state_rail_plan()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "0ae0b8bd-3e5b-4119-8fee-d496689f9c7c",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp_og.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9307a340-c699-4d93-ba30-abe04563dd8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# srp_og.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "23804222-466a-4754-a1ad-fd8f3f8a5239",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def harmonize_lost():\n",
+    "    df = harmonization_utils.load_lost()\n",
+    "    df = harmonizing(\n",
+    "        df,\n",
+    "        agency_name_col=\"agency\",\n",
+    "        project_name_col=\"project_title\",\n",
+    "        project_description_col=\"project_description\",\n",
+    "        project_category_col=\"project_category\",\n",
+    "        project_cost_col=\"cost__in_millions_\",\n",
+    "        phase_col=\"\",\n",
+    "        location_col=\"location\",\n",
+    "        geography_col=\"\",\n",
+    "        county_col=\"county\",\n",
+    "        city_col=\"city\",\n",
+    "        district_col = \"\",\n",
+    "        project_year_col=\"\",\n",
+    "        program_col=\"measure\",\n",
+    "        data_source=\"Local Options Sales Tax\",\n",
+    "        fund_cols=[\n",
+    "            \"estimated_lost_funds\",\n",
+    "            \"estimated_federal_funds\",\n",
+    "            \"estimated_state_funds\",\n",
+    "            \"estimated_local_funds\",\n",
+    "            \"estimated_other_funds\",\n",
+    "        ],\n",
+    "        notes_cols = [\"notes\"],\n",
+    "        cost_in_millions=False,\n",
+    "    )\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "e13f87d5-514f-404f-8cc8-4dbf877754da",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# lost_og = harmonization_utils.load_lost()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "60d66f94-7915-43b7-990e-896600e20d40",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# lost_og.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "20e8a81a-e6b1-4bdf-a0f8-21420c62b68a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def harmonize_sb1():\n",
+    "    df = harmonization_utils.load_sb1()\n",
+    "    df = harmonizing(\n",
+    "        df,\n",
+    "        agency_name_col=\"implementingagency\",\n",
+    "        project_name_col=\"projecttitle_x\",\n",
+    "        project_description_col=\"projectdescription\",\n",
+    "        project_category_col=\"\",\n",
+    "        phase_col=\"projectstatuses\",\n",
+    "        project_cost_col=\"totalcost\",\n",
+    "        location_col=\"\",\n",
+    "        geography_col=\"geometry\",\n",
+    "        county_col=\"countynames\",\n",
+    "        city_col=\"citynames\",\n",
+    "        district_col = \"ct_districts\",\n",
+    "        project_year_col=\"fiscalyears\",\n",
+    "        program_col=\"programcodes\",\n",
+    "        data_source=\"SB1\",\n",
+    "        fund_cols=[\"sb1funds\", \"iijafunds\"],\n",
+    "        notes_cols = ['iijaprogram','dateupdated','isonshs', 'isonshscodes','agencies', 'popup'],\n",
+    "        cost_in_millions=True,\n",
+    "    )\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "4d39d086-ef36-4f21-ab44-17980304be74",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sb1_og = harmonization_utils.load_sb1()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "9c65170e-17ef-42da-b161-358e40f815a8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sb1_og.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "5ade46ae-4768-4855-b0ea-9ff4ec7607af",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# sb1_og.drop(columns = ['geometry']).sample(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "5cab53c4-9c63-4bd4-b837-f43e62900e8d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# harmonized_sb1 = harmonize_sb1()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6c14df71-56af-43a1-b0e0-2d02ef38e18e",
+   "metadata": {},
+   "source": [
+    "### Stacking"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "53a8e2a2-9d49-4e55-a2ee-bd6224d7fb61",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Does this project have enough information to be useful?"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "3926aa77-991b-48be-b57d-04077a0a485b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def categorize_info(df): \n",
+    " \n",
+    "    # Get the 50th percentile of the project description count and of the percent of null values.\n",
+    "    p50_project_desc= df.project_description_count.quantile(0.50).astype(float)\n",
+    "    p50_null_values = df.total_percent_null_values.quantile(0.50).astype(float)\n",
+    "    \n",
+    "    # Flag rows at or above the median description count and at or below the median percent of null values\n",
+    "    def percentile_info (row):\n",
+    "        if ((row.project_description_count >= p50_project_desc) and (row.total_percent_null_values <= p50_null_values)):\n",
+    "            return \"Yes\"\n",
+    "        else: \n",
+    "            return \"No\"\n",
+    "    df[\"enough_info\"] = df.apply(lambda x: percentile_info(x), axis=1)\n",
+    "  \n",
+    "    return df    "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "48854cb1-3fa8-4d4e-8e8f-7218fc8b9c7e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def enough_info(df):\n",
+    "    # Select string columns\n",
+    "    string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n",
+    "    \n",
+    "    # https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas\n",
+    "    # Count \"nones\" in string columns\n",
+    "    df['none_counts'] = df[string_cols].astype(str).sum(axis=1).str.lower().str.count(\"none\")\n",
+    "    \n",
+    "    # Count zeroes\n",
+    "    df['zero_counts'] = (df == 0).astype(int).sum(axis=1)\n",
+    "    \n",
+    "    # Total up all none/zeroes \n",
+    "    df[\"total_percent_null_values\"] = df[['none_counts','zero_counts']].sum(axis=1)/len(df.columns) * 100\n",
+    "    \n",
+    "    # Count project descriptions\n",
+    "    df[\"project_description_count\"] = df[\"project_description\"].str.count('\\w+')\n",
+    "    \n",
+    "    # Categorize whether it has enough info or not\n",
+    "    df = categorize_info(df)\n",
+    "    \n",
+    "    # Compress columns to retain some info\n",
+    "    df['counts'] = 'number of strings in project desc: ' + df.project_description_count.astype(str) + ' % of null values:' + df.total_percent_null_values.astype(int).astype(str)\n",
+    "    \n",
+    "    df = df.drop(columns = ['none_counts','zero_counts','project_description_count','total_percent_null_values'])\n",
+    "    return df "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "91c4e4b0-f28d-4956-9274-d17a3306801e",
+   "metadata": {
+    "tags": []
+   },
+   "source": [
+    "#### Correct lead agencies again"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 123,
+   "id": "f0f564ce-5551-4750-94b6-bb7c5b056949",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def flip_county_city(df, agency_col:str):\n",
+    "    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n",
+    "    to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]\n",
+    "    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)\n",
+    "    to_correct['str_len'] = to_correct[agency_col].str.split().str.len()\n",
+    "    to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)\n",
+    "    to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)\n",
+    "    to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']\n",
+    "    \n",
+    "    new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))\n",
+    "    df['agency_corrected'] = df[agency_col].map(new_names_dictionary)\n",
+    "    df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])\n",
+    "    \n",
+    "    df = df.drop(columns = [agency_col])\n",
+    "    df = df.rename(columns = {\"agency_corrected\":agency_col})\n",
+    "    \n",
+    "    return df "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "id": "3946f71e-f987-452b-8269-331d6cb461c7",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# all_projects_metric.lead_agency.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "0bd2a79a-700b-446e-8346-5aa6fb2309f8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_all_projects():\n",
+    "\n",
+    "    # Load  dataframes\n",
+    "    state_rail_plan = harmonize_srp()\n",
+    "    lost = harmonize_lost()\n",
+    "    sb1 = harmonize_sb1()\n",
+    "\n",
+    "    # Concat for df\n",
+    "    df = pd.concat([lost, state_rail_plan, sb1])\n",
+    "    \n",
+    "    # Clean agency names\n",
+    "    df = harmonization_utils.organization_cleaning(df, \"lead_agency\")\n",
+    "    df = flip_county_city(df, 'lead_agency')\n",
+    "    \n",
+    "    # Determine if the project completely funded or not?\n",
+    "    # Add up all available funds\n",
+    "    df[\"fully_funded\"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)\n",
+    "    \n",
+    "    # Does this project have enough info?\n",
+    "    df = enough_info(df)\n",
+    "    \n",
+    "    \n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "5dcafef7-30b9-4582-93c8-188ede6b8562",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:37: FutureWarning: The default value of regex will change from True to False in a future version.\n",
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/home/jovyan/data-analyses/project_list/_harmonization_utils.py:34: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n"
+     ]
+    }
+   ],
+   "source": [
+    "all_projects = add_all_projects()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 27,
+   "id": "64c6a43d-0a8c-4f7c-a3cc-df3415163bf4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Index(['project_title', 'lead_agency', 'project_year', 'project_category',\n",
+       "       'grant_program', 'phase', 'project_description',\n",
+       "       'total_project_cost_(millions)', 'total_available_funds_(millions)',\n",
+       "       'unfunded_needs_(millions)', 'city', 'county', 'location', 'geometry',\n",
+       "       'data_source', 'notes', 'funding_notes', 'ct_district',\n",
+       "       'project_description2', 'fully_funded', 'enough_info', 'counts'],\n",
+       "      dtype='object')"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 53,
+   "id": "78825d55-c1b0-447b-b33e-493c7165aa25",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>lead_agency</th>\n",
+       "      <th>project_year</th>\n",
+       "      <th>project_category</th>\n",
+       "      <th>grant_program</th>\n",
+       "      <th>phase</th>\n",
+       "      <th>project_description</th>\n",
+       "      <th>total_project_cost_(millions)</th>\n",
+       "      <th>total_available_funds_(millions)</th>\n",
+       "      <th>unfunded_needs_(millions)</th>\n",
+       "      <th>city</th>\n",
+       "      <th>county</th>\n",
+       "      <th>location</th>\n",
+       "      <th>data_source</th>\n",
+       "      <th>notes</th>\n",
+       "      <th>funding_notes</th>\n",
+       "      <th>ct_district</th>\n",
+       "      <th>project_description2</th>\n",
+       "      <th>fully_funded</th>\n",
+       "      <th>enough_info</th>\n",
+       "      <th>counts</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>358</th>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>19/20, 20/21</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sgr</td>\n",
+       "      <td>In Progress, Planned</td>\n",
+       "      <td>None</td>\n",
+       "      <td>0.12</td>\n",
+       "      <td>0.12</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>Corcoran</td>\n",
+       "      <td>Kings</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sb1</td>\n",
+       "      <td>Iijaprogram: ,  Dateupdated: 2021-09-09,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Corcoran,  Popup: None</td>\n",
+       "      <td>Sb1Funds: 0.121909,  Iijafunds: 0.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Fully funded</td>\n",
+       "      <td>No</td>\n",
+       "      <td>number of strings in project desc: 1 % of null values:40</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1085</th>\n",
+       "      <td>Spring Street Overlay</td>\n",
+       "      <td>City Of Signal Hill</td>\n",
+       "      <td>19/20</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Lsr</td>\n",
+       "      <td>Planned</td>\n",
+       "      <td>None</td>\n",
+       "      <td>3.00</td>\n",
+       "      <td>0.13</td>\n",
+       "      <td>2.87</td>\n",
+       "      <td>Signal Hill</td>\n",
+       "      <td>Los Angeles</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sb1</td>\n",
+       "      <td>Iijaprogram: ,  Dateupdated: 6/30/2021,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Signal Hill,  Popup: None</td>\n",
+       "      <td>Sb1Funds: 0.126705,  Iijafunds: 0.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Partially funded</td>\n",
+       "      <td>No</td>\n",
+       "      <td>number of strings in project desc: 1 % of null values:27</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2106</th>\n",
+       "      <td>Major Damage Restoration</td>\n",
+       "      <td>Caltrans</td>\n",
+       "      <td>20/21</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>In Progress</td>\n",
+       "      <td>A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.</td>\n",
+       "      <td>16.52</td>\n",
+       "      <td>9.08</td>\n",
+       "      <td>7.44</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Del Norte</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sb1</td>\n",
+       "      <td>Iijaprogram: State Hwy Operations &amp; Protection Program Major-Federal,  Dateupdated: 2022-06-28,  Isonshs: None,  Isonshscodes: Y,  Agencies: Caltrans,  Popup: Major Damage Restorationbr</td>\n",
+       "      <td>Sb1Funds: 0.0,  Iijafunds: 9.083566</td>\n",
+       "      <td>01</td>\n",
+       "      <td>A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.</td>\n",
+       "      <td>Partially funded</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>number of strings in project desc: 25 % of null values:18</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                 project_title          lead_agency  project_year  \\\n",
+       "358                       None                 None  19/20, 20/21   \n",
+       "1085     Spring Street Overlay  City Of Signal Hill         19/20   \n",
+       "2106  Major Damage Restoration             Caltrans         20/21   \n",
+       "\n",
+       "     project_category grant_program                 phase  \\\n",
+       "358              None           Sgr  In Progress, Planned   \n",
+       "1085             None           Lsr               Planned   \n",
+       "2106             None         Shopp           In Progress   \n",
+       "\n",
+       "                                                                                                                                         project_description  \\\n",
+       "358                                                                                                                                                     None   \n",
+       "1085                                                                                                                                                    None   \n",
+       "2106  A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.   \n",
+       "\n",
+       "      total_project_cost_(millions)  total_available_funds_(millions)  \\\n",
+       "358                            0.12                              0.12   \n",
+       "1085                           3.00                              0.13   \n",
+       "2106                          16.52                              9.08   \n",
+       "\n",
+       "      unfunded_needs_(millions)         city       county location  \\\n",
+       "358                        0.00     Corcoran        Kings     None   \n",
+       "1085                       2.87  Signal Hill  Los Angeles     None   \n",
+       "2106                       7.44         None    Del Norte     None   \n",
+       "\n",
+       "     data_source  \\\n",
+       "358          Sb1   \n",
+       "1085         Sb1   \n",
+       "2106         Sb1   \n",
+       "\n",
+       "                                                                                                                                                                                          notes  \\\n",
+       "358                                                                           Iijaprogram: ,  Dateupdated: 2021-09-09,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Corcoran,  Popup: None   \n",
+       "1085                                                                        Iijaprogram: ,  Dateupdated: 6/30/2021,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Signal Hill,  Popup: None   \n",
+       "2106  Iijaprogram: State Hwy Operations & Protection Program Major-Federal,  Dateupdated: 2022-06-28,  Isonshs: None,  Isonshscodes: Y,  Agencies: Caltrans,  Popup: Major Damage Restorationbr   \n",
+       "\n",
+       "                            funding_notes ct_district  \\\n",
+       "358   Sb1Funds: 0.121909,  Iijafunds: 0.0        None   \n",
+       "1085  Sb1Funds: 0.126705,  Iijafunds: 0.0        None   \n",
+       "2106  Sb1Funds: 0.0,  Iijafunds: 9.083566          01   \n",
+       "\n",
+       "                                                                                                                                        project_description2  \\\n",
+       "358                                                                                                                                                     None   \n",
+       "1085                                                                                                                                                    None   \n",
+       "2106  A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.   \n",
+       "\n",
+       "          fully_funded enough_info  \\\n",
+       "358       Fully funded          No   \n",
+       "1085  Partially funded          No   \n",
+       "2106  Partially funded         Yes   \n",
+       "\n",
+       "                                                         counts  \n",
+       "358    number of strings in project desc: 1 % of null values:40  \n",
+       "1085   number of strings in project desc: 1 % of null values:27  \n",
+       "2106  number of strings in project desc: 25 % of null values:18  "
+      ]
+     },
+     "execution_count": 53,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects.drop(columns = ['geometry']).sample(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 29,
+   "id": "0983ba29-f492-4a1a-ad40-78ebd291f7d2",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Shopp                              1631\n",
+       "Imperial D 2008                     726\n",
+       "Hm                                  520\n",
+       "Lsr                                 285\n",
+       "State Rail Plan                     276\n",
+       "Atp                                 216\n",
+       "Sgr                                 156\n",
+       "Stip                                126\n",
+       "San Mateo W 2018                     91\n",
+       "Los Angeles Angeles M 2016           89\n",
+       "San Benito G 2004                    86\n",
+       "Santa Clara B 2016                   85\n",
+       "Tircp                                82\n",
+       "Shopa                                79\n",
+       "San Mateo A2 2006                    78\n",
+       "Alameda B 2000                       62\n",
+       "San Diego A 2004                     59\n",
+       "San Joaquin K 2003                   56\n",
+       "Tcep                                 55\n",
+       "San Bernardino I2 2018               51\n",
+       "Sacramento A2 2004                   51\n",
+       "Tulare R 2006                        49\n",
+       "Sta                                  49\n",
+       "Sonoma M 2004                        44\n",
+       "Alameda Bb 2014                      40\n",
+       "Lpp-F                                40\n",
+       "Santa Barbara A 2008                 37\n",
+       "Los Angeles Angeles R 2008           37\n",
+       "Madera T 2006                        36\n",
+       "Sccp                                 34\n",
+       "San Francisco K 2004                 28\n",
+       "Riverside A2 2006                    27\n",
+       "Lpp-C                                21\n",
+       "Stanislaus L 2016                    20\n",
+       "Contra Costa J 2004                  19\n",
+       "Orange M2 2002                       19\n",
+       "Santa Clara A 2000                   14\n",
+       "Sra                                  11\n",
+       "Monterey X 2016                      11\n",
+       "Santa Cruz D 2016                     9\n",
+       "Marin A 2004                          7\n",
+       "Monterey Salinas Transit Q 2016       6\n",
+       "Sonoma Q 2008                         5\n",
+       "Fresno C 2006                         5\n",
+       "Los Angelest Alameda Bb 2014          1\n",
+       "Santa Clara B 2008                    1\n",
+       "Name: grant_program, dtype: int64"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects.grant_program.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 30,
+   "id": "301b9bde-499e-445d-a27c-f50f522e4aa9",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Sb1                        3305\n",
+       "Local Options Sales Tax    1849\n",
+       "State Rail Plan             276\n",
+       "Name: data_source, dtype: int64"
+      ]
+     },
+     "execution_count": 30,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects.data_source.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 31,
+   "id": "0c066920-6b09-4584-bc82-4f88b41e00d8",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "0.00    20.06\n",
+       "0.33     2.65\n",
+       "0.25     1.25\n",
+       "7.61     0.85\n",
+       "17.86    0.77\n",
+       "Name: total_project_cost_(millions), dtype: float64"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects[\"total_project_cost_(millions)\"].value_counts().head() / len(all_projects) * 100"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "id": "413ac763-c08b-48b0-91d5-6e53fd8f2c32",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "No available funding info    1963\n",
+       "Partially funded             1796\n",
+       "No project cost info         1089\n",
+       "Fully funded                  582\n",
+       "Name: fully_funded, dtype: int64"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects.fully_funded.value_counts()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8add5491-77d7-4eaa-ad79-57072f7eddd9",
+   "metadata": {},
+   "source": [
+    "### Metrics\n",
+    "* Rewrite to be shorter?\n",
+    "* Correct spelling of descriptions?\n",
+    "* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "id": "6d6253cd-b5f8-4431-a575-9a274e6e8bae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_categories(df):\n",
+    "    \"\"\"\n",
+    "    Create general categories for each project.\n",
+    "    https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305\n",
+    "    \"\"\"\n",
+    "    # Keyword lists for each category; one project can match several categories.\n",
+    "    ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist', \n",
+    "                             'pedestrian', \n",
+    "                             ## including the spelling errors of `pedestrian`\n",
+    "                             'pedestrain',\n",
+    "                             'crosswalk', \n",
+    "                             'bulb out', 'bulb-out', \n",
+    "                             'active transp', 'traffic reduction', \n",
+    "                             'speed reduction', 'ped', 'srts', \n",
+    "                             'safe routes to school',\n",
+    "                             'sidewalk', 'side walk', 'Cl ', 'trail',\n",
+    "                             'atp'\n",
+    "                            ]\n",
+    "    TRANSIT = ['bus', 'metro', 'station', #Station comes up a few times as a charging station and also as a train station\n",
+    "               'transit','fare', 'brt', 'yarts', 'railroad', 'highway-rail'\n",
+    "               # , 'station' in description and 'charging station' not in description\n",
+    "              ] \n",
+    "    BRIDGE = [\"bridge\", 'viaduct']\n",
+    "    STREET = ['traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal', \n",
+    "              'sign', 'stripe', 'striping', 'median', \n",
+    "              'guard rail', 'guardrail', \n",
+    "              'road', 'street', \n",
+    "              'sinkhole', 'intersection', 'signal', 'curb',\n",
+    "              'light', 'tree', 'pavement', 'roundabout'\n",
+    "             ]\n",
+    "\n",
+    "    FREEWAY = ['hov ', 'hot ', 'freeway', 'highway', 'express lanes', 'hwy']\n",
+    "\n",
+    "    INFRA_RESILIENCY_ER = ['repair', 'emergency', 'replace','retrofit', 'er',\n",
+    "                           'rehab', 'improvements', 'seismic', 'reconstruct', 'restoration']\n",
+    "\n",
+    "    CONGESTION_RELIEF = ['congestion', 'rideshare','ridesharing', 'vanpool', 'car share']\n",
+    "\n",
+    "    NOT_INC = ['charging', 'fueling', 'cng', 'bridge', 'trail',\n",
+    "           'k-rail', 'guardrails', 'bridge rail', 'guard', 'guarrail']\n",
+    "    \n",
+    "    PASSENGER_MODE = ['non sov', 'high quality transit areas', \n",
+    "                      'hqta', 'hov']\n",
+    "    \n",
+    "    \n",
+    "    SAFETY = ['fatalities','safe', 'speed management','signal coordination',\n",
+    "              'slow speeds', 'roundabouts', 'victims','collisoins','protect',\n",
+    "              'crash', 'modification factors', 'safety system'] \n",
+    "    \n",
+    "    def categorize_project_descriptions(row):\n",
+    "        \"\"\"\n",
+    "        Take one row and return a Series with one entry per project category\n",
+    "        (active transportation, transit, bridge, etc): the category label if a\n",
+    "        keyword is found in `project_description2` (transit also requires that no\n",
+    "        NOT_INC word is present), else an empty string. A row can match several.\n",
+    "        \"\"\"\n",
+    "        # Clean up project description 2\n",
+    "        project_description = (row.project_description2.lower()\n",
+    "                               .replace(\"-\",\"\")\n",
+    "                               .replace(\".\",\"\")\n",
+    "                               .replace(\":\",\"\")\n",
+    "                              )\n",
+    "    \n",
+    "        # Store a bunch of columns that will be flagged\n",
+    "        # A project can involve multiple things...also, not sure what's in the descriptions\n",
+    "        active_transp = \"\"\n",
+    "        transit = \"\"\n",
+    "        bridge =\"\"\n",
+    "        street = \"\"\n",
+    "        freeway = \"\"\n",
+    "        infra_resiliency_er = \"\"\n",
+    "        congestion_relief = \"\"\n",
+    "        passenger_mode_shift = \"\"\n",
+    "        safety = \"\"\n",
+    "        \n",
+    "        if any(word in project_description for word in ACTIVE_TRANSPORTATION):\n",
+    "            active_transp = \"active transportation\"\n",
+    "        \n",
+    "        #if any(word in description if instanceof(word, str) else word(description) for word in TRANSIT)\n",
+    "\n",
+    "        if (any(word in project_description for word in TRANSIT) and \n",
+    "            not any(exclude_word in project_description for exclude_word in NOT_INC)\n",
+    "           ):\n",
+    "            transit = \"transit\"\n",
+    "        if any(word in project_description for word in BRIDGE):\n",
+    "            bridge = \"bridge\"\n",
+    "        if any(word in project_description for word in STREET):\n",
+    "            street = \"street\"\n",
+    "        if any(word in project_description for word in FREEWAY):\n",
+    "            freeway = \"freeway\" \n",
+    "        if any(word in project_description for word in INFRA_RESILIENCY_ER):\n",
+    "            infra_resiliency_er = \"infrastructure\"\n",
+    "        if any(word in project_description for word in CONGESTION_RELIEF):\n",
+    "            congestion_relief = \"congestion relief\"    \n",
+    "        if any(word in project_description for word in PASSENGER_MODE):\n",
+    "            passenger_mode_shift = \"passenger mode shift\"    \n",
+    "        if any(word in project_description for word in SAFETY):\n",
+    "            safety = \"safety\"    \n",
+    "        return pd.Series(\n",
+    "            [active_transp, transit, bridge, street, freeway, infra_resiliency_er, congestion_relief,\n",
+    "            passenger_mode_shift, safety], \n",
+    "            index=['active_transp', 'transit', 'bridge', 'street', \n",
+    "                   'freeway', 'infra_resiliency_er', 'congestion_relief',\n",
+    "                  'passenger_mode_shift', 'safety']\n",
+    "        )\n",
+    "    \n",
+    "    \n",
+    "    work_categories = df.apply(categorize_project_descriptions, axis=1)\n",
+    "    work_cols = list(work_categories.columns)\n",
+    "    df2 = pd.concat([df, work_categories], axis=1)\n",
+    "    \n",
+    "    df2['categories'] = df2[work_cols].agg(' '.join, axis=1)\n",
+    "    df2['categories'] = df2['categories'].str.strip()\n",
+    "    df2 = df2.drop(columns = work_cols)\n",
+    "    \n",
+    "    return df2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "id": "6ea11daa-3a18-4d8a-9004-b2fc5e6d4343",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects_metric = add_categories(all_projects)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 67,
+   "id": "f3856c74-228d-4cf8-929a-cac486024586",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>lead_agency</th>\n",
+       "      <th>project_year</th>\n",
+       "      <th>project_category</th>\n",
+       "      <th>grant_program</th>\n",
+       "      <th>phase</th>\n",
+       "      <th>project_description</th>\n",
+       "      <th>total_project_cost_(millions)</th>\n",
+       "      <th>total_available_funds_(millions)</th>\n",
+       "      <th>unfunded_needs_(millions)</th>\n",
+       "      <th>city</th>\n",
+       "      <th>county</th>\n",
+       "      <th>location</th>\n",
+       "      <th>data_source</th>\n",
+       "      <th>notes</th>\n",
+       "      <th>funding_notes</th>\n",
+       "      <th>ct_district</th>\n",
+       "      <th>project_description2</th>\n",
+       "      <th>fully_funded</th>\n",
+       "      <th>enough_info</th>\n",
+       "      <th>counts</th>\n",
+       "      <th>categories</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>1886</th>\n",
+       "      <td>Safety - Hm4</td>\n",
+       "      <td>Caltrans</td>\n",
+       "      <td>21/22</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Hm</td>\n",
+       "      <td>In Progress</td>\n",
+       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
+       "      <td>0.20</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>0.20</td>\n",
+       "      <td>Visalia</td>\n",
+       "      <td>Tulare</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sb1</td>\n",
+       "      <td>Iijaprogram: None,  Dateupdated: 2022-09-19,  Isonshs: None,  Isonshscodes: N,  Agencies: Caltrans,  Popup:</td>\n",
+       "      <td>Sb1Funds: 0.0,  Iijafunds: 0.0</td>\n",
+       "      <td>06</td>\n",
+       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
+       "      <td>No available funding info</td>\n",
+       "      <td>Yes</td>\n",
+       "      <td>number of strings in project desc: 5 % of null values:22</td>\n",
+       "      <td>infrastructure</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1119</th>\n",
+       "      <td>Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Freeway Safety And Congestion Relief Program</td>\n",
+       "      <td>Sacramento A2 2004</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>47.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>47.00</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Sacramento</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Local Options Sales Tax</td>\n",
+       "      <td>Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.</td>\n",
+       "      <td>Estimated Lost Funds: 0.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 0.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S</td>\n",
+       "      <td>No available funding info</td>\n",
+       "      <td>No</td>\n",
+       "      <td>number of strings in project desc: 1 % of null values:40</td>\n",
+       "      <td>transit</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1589</th>\n",
+       "      <td>Highway 101: Betteravia Road Interchange</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Santa Barbara A 2008</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.</td>\n",
+       "      <td>2.00</td>\n",
+       "      <td>5.00</td>\n",
+       "      <td>-3.00</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Santa Barbara</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Local Options Sales Tax</td>\n",
+       "      <td>Notes: Nan</td>\n",
+       "      <td>Estimated Lost Funds: 2.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 3.0</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.</td>\n",
+       "      <td>Fully funded</td>\n",
+       "      <td>No</td>\n",
+       "      <td>number of strings in project desc: 24 % of null values:36</td>\n",
+       "      <td>street freeway infrastructure</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                            project_title lead_agency  \\\n",
+       "1886                                         Safety - Hm4    Caltrans   \n",
+       "1119  Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S        None   \n",
+       "1589             Highway 101: Betteravia Road Interchange        None   \n",
+       "\n",
+       "     project_year                              project_category  \\\n",
+       "1886        21/22                                          None   \n",
+       "1119         None  Freeway Safety And Congestion Relief Program   \n",
+       "1589         None                                          None   \n",
+       "\n",
+       "             grant_program        phase  \\\n",
+       "1886                    Hm  In Progress   \n",
+       "1119    Sacramento A2 2004         None   \n",
+       "1589  Santa Barbara A 2008         None   \n",
+       "\n",
+       "                                                                                                                                                          project_description  \\\n",
+       "1886                                                                                                                        Maintain/Repair Transportaiton Management Systems   \n",
+       "1119                                                                                                                                                                     None   \n",
+       "1589  Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.   \n",
+       "\n",
+       "      total_project_cost_(millions)  total_available_funds_(millions)  \\\n",
+       "1886                           0.20                              0.00   \n",
+       "1119                          47.00                              0.00   \n",
+       "1589                           2.00                              5.00   \n",
+       "\n",
+       "      unfunded_needs_(millions)     city         county location  \\\n",
+       "1886                       0.20  Visalia         Tulare     None   \n",
+       "1119                      47.00     None     Sacramento     None   \n",
+       "1589                      -3.00     None  Santa Barbara     None   \n",
+       "\n",
+       "                  data_source  \\\n",
+       "1886                      Sb1   \n",
+       "1119  Local Options Sales Tax   \n",
+       "1589  Local Options Sales Tax   \n",
+       "\n",
+       "                                                                                                                                      notes  \\\n",
+       "1886                            Iijaprogram: None,  Dateupdated: 2022-09-19,  Isonshs: None,  Isonshscodes: N,  Agencies: Caltrans,  Popup:   \n",
+       "1119  Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.   \n",
+       "1589                                                                                                                             Notes: Nan   \n",
+       "\n",
+       "                                                                                                                                      funding_notes  \\\n",
+       "1886                                                                                                                 Sb1Funds: 0.0,  Iijafunds: 0.0   \n",
+       "1119  Estimated Lost Funds: 0.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 0.0   \n",
+       "1589  Estimated Lost Funds: 2.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 3.0   \n",
+       "\n",
+       "     ct_district  \\\n",
+       "1886          06   \n",
+       "1119        None   \n",
+       "1589        None   \n",
+       "\n",
+       "                                                                                                                                                         project_description2  \\\n",
+       "1886                                                                                                                        Maintain/Repair Transportaiton Management Systems   \n",
+       "1119                                                                                                                      Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S   \n",
+       "1589  Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.   \n",
+       "\n",
+       "                   fully_funded enough_info  \\\n",
+       "1886  No available funding info         Yes   \n",
+       "1119  No available funding info          No   \n",
+       "1589               Fully funded          No   \n",
+       "\n",
+       "                                                         counts  \\\n",
+       "1886   number of strings in project desc: 5 % of null values:22   \n",
+       "1119   number of strings in project desc: 1 % of null values:40   \n",
+       "1589  number of strings in project desc: 24 % of null values:36   \n",
+       "\n",
+       "                         categories  \n",
+       "1886                 infrastructure  \n",
+       "1119                        transit  \n",
+       "1589  street freeway infrastructure  "
+      ]
+     },
+     "execution_count": 67,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects_metric.drop(columns = ['geometry']).sample(3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 69,
+   "id": "ad99b589-1d78-4052-96ac-4617f0494544",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "infrastructure                                                    1436\n",
+       "                                                                  1381\n",
+       "street  infrastructure                                             739\n",
+       "street                                                             372\n",
+       "bridge   infrastructure                                            226\n",
+       "transit    infrastructure                                          201\n",
+       "active transportation   street  infrastructure                     106\n",
+       "transit                                                             75\n",
+       "street  infrastructure   safety                                     58\n",
+       "transit  street  infrastructure                                     52\n",
+       "freeway infrastructure                                              52\n",
+       "bridge street  infrastructure                                       45\n",
+       "bridge                                                              44\n",
+       "active transportation     infrastructure                            44\n",
+       "active transportation                                               42\n",
+       "street freeway infrastructure   safety                              41\n",
+       "street freeway infrastructure                                       37\n",
+       "infrastructure   safety                                             36\n",
+       "active transportation   street  infrastructure   safety             29\n",
+       "freeway infrastructure  passenger mode shift                        22\n",
+       "active transportation transit    infrastructure                     21\n",
+       "freeway                                                             20\n",
+       "freeway infrastructure   safety                                     18\n",
+       "active transportation transit  street  infrastructure               17\n",
+       "bridge street  infrastructure   safety                              14\n",
+       "bridge   infrastructure   safety                                    12\n",
+       "street  infrastructure congestion relief                            11\n",
+       "active transportation transit  street  infrastructure   safety      11\n",
+       "passenger mode shift                                                11\n",
+       "street     safety                                                   10\n",
+       "Name: categories, dtype: int64"
+      ]
+     },
+     "execution_count": 69,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects_metric.categories.value_counts().head(30)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 88,
+   "id": "270e8b35-cc6b-4461-835c-40c4b850916d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def apply_metrics(df):\n",
+    "    \"\"\"Return a copy of `df` with an `applicable_metrics` column.\n",
+    "\n",
+    "    Each row's `categories` text is lower-cased and searched for keyword\n",
+    "    substrings. The labels of every metric with at least one matching\n",
+    "    keyword are joined with single spaces, in the order safety,\n",
+    "    passenger_mode_shift, infill_development. Rows that match nothing, or\n",
+    "    whose `categories` value is not a string, get an empty string.\n",
+    "\n",
+    "    The input DataFrame is not modified.\n",
+    "    \"\"\"\n",
+    "    # Metric label -> keywords that trigger it (substring match).\n",
+    "    metric_keywords = {\n",
+    "        'safety': ('infrastructure', 'safety'),\n",
+    "        'passenger_mode_shift': (\n",
+    "            'active transportation',\n",
+    "            # `categories` spells this tag with spaces; the underscored\n",
+    "            # spelling is kept so text that already uses it still matches.\n",
+    "            'passenger mode shift',\n",
+    "            'passenger_mode_shift',\n",
+    "            'congestion relief',\n",
+    "        ),\n",
+    "        'infill_development': ('transit', 'active transportation'),\n",
+    "    }\n",
+    "\n",
+    "    def categorize_metrics(categories):\n",
+    "        # Missing or non-string categories cannot match any keyword.\n",
+    "        if not isinstance(categories, str):\n",
+    "            return ''\n",
+    "        categories = categories.lower()\n",
+    "        # Join only the metrics that matched so no stray spaces are left\n",
+    "        # behind by metrics that did not.\n",
+    "        return ' '.join(\n",
+    "            metric\n",
+    "            for metric, keywords in metric_keywords.items()\n",
+    "            if any(word in categories for word in keywords)\n",
+    "        )\n",
+    "\n",
+    "    return df.assign(applicable_metrics=df['categories'].map(categorize_metrics))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 89,
+   "id": "9a643de4-b6b3-4751-9a9f-b68abe4d7a22",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects_metric = apply_metrics(all_projects_metric)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 90,
+   "id": "a6da3b49-dd3f-4b01-b394-23f44bf8e3a6",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>grant_program</th>\n",
+       "      <th>project_description2</th>\n",
+       "      <th>categories</th>\n",
+       "      <th>applicable_metrics</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2587</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1058</th>\n",
+       "      <td>Lsr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>17</th>\n",
+       "      <td>State Rail Plan</td>\n",
+       "      <td>Expansion Of The Smart Fleet To Accommodate Service Expansion.</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>845</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Overlay</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1933</th>\n",
+       "      <td>Hm</td>\n",
+       "      <td>Maintain/Repair Pavement - Seal Coat</td>\n",
+       "      <td>street  infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2032</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>331</th>\n",
+       "      <td>Sgr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3222</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)</td>\n",
+       "      <td>bridge   infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>106</th>\n",
+       "      <td>State Rail Plan</td>\n",
+       "      <td>Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.</td>\n",
+       "      <td>transit</td>\n",
+       "      <td>infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1092</th>\n",
+       "      <td>Lsr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1703</th>\n",
+       "      <td>Stip</td>\n",
+       "      <td>Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198.  Widen To 4 Divided Lanes And Realign Highway.</td>\n",
+       "      <td>street freeway infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>753</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Overlay</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>43</th>\n",
+       "      <td>State Rail Plan</td>\n",
+       "      <td>Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.</td>\n",
+       "      <td>transit    infrastructure</td>\n",
+       "      <td>safety  infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>627</th>\n",
+       "      <td>Hm</td>\n",
+       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>900</th>\n",
+       "      <td>Los Angeles Angeles M 2016</td>\n",
+       "      <td>Transportation System And Mobility Improve Program</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2016</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.</td>\n",
+       "      <td>street  infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1759</th>\n",
+       "      <td>Hm</td>\n",
+       "      <td>Repair/Replace Culverts</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2082</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.</td>\n",
+       "      <td>street  infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1488</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.</td>\n",
+       "      <td>transit    infrastructure</td>\n",
+       "      <td>safety  infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1886</th>\n",
+       "      <td>Hm</td>\n",
+       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>311</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Reconstruction</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1335</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.</td>\n",
+       "      <td>bridge street  infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>487</th>\n",
+       "      <td>Hm</td>\n",
+       "      <td>Maintain/Repair Maintenance Station</td>\n",
+       "      <td>transit    infrastructure</td>\n",
+       "      <td>safety  infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>471</th>\n",
+       "      <td>Sgr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>901</th>\n",
+       "      <td>Los Angeles Angeles M 2016</td>\n",
+       "      <td>Active Transportation 1St/Last Mile Connections Prog</td>\n",
+       "      <td>active transportation</td>\n",
+       "      <td>passenger_mode_shift infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>456</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Lincoln Ave From Rose Ave To Weakley St S</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1140</th>\n",
+       "      <td>Lsr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3275</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.</td>\n",
+       "      <td>street  infrastructure   safety</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3176</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.</td>\n",
+       "      <td>active transportation   street</td>\n",
+       "      <td>passenger_mode_shift infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1540</th>\n",
+       "      <td>San Mateo W 2018</td>\n",
+       "      <td>Pedestrian Accessibility Improvements Citywide</td>\n",
+       "      <td>active transportation     infrastructure</td>\n",
+       "      <td>safety passenger_mode_shift infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1318</th>\n",
+       "      <td>San Diego A 2004</td>\n",
+       "      <td>8F+2Hov</td>\n",
+       "      <td>passenger mode shift</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3135</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>320</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Overlay</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1633</th>\n",
+       "      <td>Tcep</td>\n",
+       "      <td>In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System  Needed To Operate The I-15 Express Lanes Project (08-0167M).</td>\n",
+       "      <td>street freeway infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1376</th>\n",
+       "      <td>San Joaquin K 2003</td>\n",
+       "      <td>Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3036</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.</td>\n",
+       "      <td>street  infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>826</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Reconstruct</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>340</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Crack Seal/Slurry Coat</td>\n",
+       "      <td>street</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2405</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.</td>\n",
+       "      <td>street freeway infrastructure   safety</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>814</th>\n",
+       "      <td>Imperial D 2008</td>\n",
+       "      <td>Overlay</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>44</th>\n",
+       "      <td>Tircp</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1041</th>\n",
+       "      <td>Lsr</td>\n",
+       "      <td>None</td>\n",
+       "      <td></td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>134</th>\n",
+       "      <td>State Rail Plan</td>\n",
+       "      <td>Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton.  This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.</td>\n",
+       "      <td>transit    infrastructure</td>\n",
+       "      <td>safety  infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2009</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.</td>\n",
+       "      <td>bridge   infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>12</th>\n",
+       "      <td>Alameda B 2000</td>\n",
+       "      <td>I-580 Interchange Improvements In Castro Valley</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2107</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.</td>\n",
+       "      <td>infrastructure   safety</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1371</th>\n",
+       "      <td>Shopp</td>\n",
+       "      <td>A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.</td>\n",
+       "      <td>bridge   infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>973</th>\n",
+       "      <td>Los Angeles Angeles R 2008</td>\n",
+       "      <td>Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)</td>\n",
+       "      <td>transit    infrastructure</td>\n",
+       "      <td>safety  infill_development</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1823</th>\n",
+       "      <td>Tulare R 2006</td>\n",
+       "      <td>Over Crossing</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>97</th>\n",
+       "      <td>State Rail Plan</td>\n",
+       "      <td>Double Track Between Cp Canyon (Newhall Siding)  And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.</td>\n",
+       "      <td>infrastructure</td>\n",
+       "      <td>safety</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                   grant_program  \\\n",
+       "2587                       Shopp   \n",
+       "1058                         Lsr   \n",
+       "17               State Rail Plan   \n",
+       "845              Imperial D 2008   \n",
+       "1933                          Hm   \n",
+       "2032                       Shopp   \n",
+       "331                          Sgr   \n",
+       "3222                       Shopp   \n",
+       "106              State Rail Plan   \n",
+       "1092                         Lsr   \n",
+       "1703                        Stip   \n",
+       "753              Imperial D 2008   \n",
+       "43               State Rail Plan   \n",
+       "627                           Hm   \n",
+       "900   Los Angeles Angeles M 2016   \n",
+       "2016                       Shopp   \n",
+       "1759                          Hm   \n",
+       "2082                       Shopp   \n",
+       "1488                       Shopp   \n",
+       "1886                          Hm   \n",
+       "311              Imperial D 2008   \n",
+       "1335                       Shopp   \n",
+       "487                           Hm   \n",
+       "471                          Sgr   \n",
+       "901   Los Angeles Angeles M 2016   \n",
+       "456              Imperial D 2008   \n",
+       "1140                         Lsr   \n",
+       "3275                       Shopp   \n",
+       "3176                       Shopp   \n",
+       "1540            San Mateo W 2018   \n",
+       "1318            San Diego A 2004   \n",
+       "3135                       Shopp   \n",
+       "320              Imperial D 2008   \n",
+       "1633                        Tcep   \n",
+       "1376          San Joaquin K 2003   \n",
+       "3036                       Shopp   \n",
+       "826              Imperial D 2008   \n",
+       "340              Imperial D 2008   \n",
+       "2405                       Shopp   \n",
+       "814              Imperial D 2008   \n",
+       "44                         Tircp   \n",
+       "1041                         Lsr   \n",
+       "134              State Rail Plan   \n",
+       "2009                       Shopp   \n",
+       "12                Alameda B 2000   \n",
+       "2107                       Shopp   \n",
+       "1371                       Shopp   \n",
+       "973   Los Angeles Angeles R 2008   \n",
+       "1823               Tulare R 2006   \n",
+       "97               State Rail Plan   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                               project_description2  \\\n",
+       "2587                                                                                                                                                                   A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.   \n",
+       "1058                                                                                                                                                                                                                                                                                                                           None   \n",
+       "17                                                                                                                                                                                                                                                                   Expansion Of The Smart Fleet To Accommodate Service Expansion.   \n",
+       "845                                                                                                                                                                                                                                                                                                                         Overlay   \n",
+       "1933                                                                                                                                                                                                                                                                                           Maintain/Repair Pavement - Seal Coat   \n",
+       "2032                                                                                                                                                                                                                                          A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.   \n",
+       "331                                                                                                                                                                                                                                                                                                                            None   \n",
+       "3222                                                                                                                        A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)   \n",
+       "106                                                                                                                                                                    Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.   \n",
+       "1092                                                                                                                                                                                                                                                                                                                           None   \n",
+       "1703                                                                                                                                                                       Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198.  Widen To 4 Divided Lanes And Realign Highway.   \n",
+       "753                                                                                                                                                                                                                                                                                                                         Overlay   \n",
+       "43    Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.   \n",
+       "627                                                                                                                                                                                                                                                                               Maintain/Repair Transportaiton Management Systems   \n",
+       "900                                                                                                                                                                                                                                                                              Transportation System And Mobility Improve Program   \n",
+       "2016                                                                                                                                                          A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.   \n",
+       "1759                                                                                                                                                                                                                                                                                                        Repair/Replace Culverts   \n",
+       "2082                             A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.   \n",
+       "1488                                                                                                                                                                                                                     A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.   \n",
+       "1886                                                                                                                                                                                                                                                                              Maintain/Repair Transportaiton Management Systems   \n",
+       "311                                                                                                                                                                                                                                                                                                                  Reconstruction   \n",
+       "1335                                                                                                                                                A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.   \n",
+       "487                                                                                                                                                                                                                                                                                             Maintain/Repair Maintenance Station   \n",
+       "471                                                                                                                                                                                                                                                                                                                            None   \n",
+       "901                                                                                                                                                                                                                                                                            Active Transportation 1St/Last Mile Connections Prog   \n",
+       "456                                                                                                                                                                                                                                                                                       Lincoln Ave From Rose Ave To Weakley St S   \n",
+       "1140                                                                                                                                                                                                                                                                                                                           None   \n",
+       "3275                                                                                                                                                                             A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.   \n",
+       "3176                                                                                                                                                                       A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.   \n",
+       "1540                                                                                                                                                                                                                                                                                 Pedestrian Accessibility Improvements Citywide   \n",
+       "1318                                                                                                                                                                                                                                                                                                                        8F+2Hov   \n",
+       "3135                                                                                                                                                                                                            A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).   \n",
+       "320                                                                                                                                                                                                                                                                                                                         Overlay   \n",
+       "1633                                       In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System  Needed To Operate The I-15 Express Lanes Project (08-0167M).   \n",
+       "1376                                                                                                                                                                                                                                                                   Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits   \n",
+       "3036                                                                                                             A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.   \n",
+       "826                                                                                                                                                                                                                                                                                                                     Reconstruct   \n",
+       "340                                                                                                                                                                                                                                                                                                          Crack Seal/Slurry Coat   \n",
+       "2405                                                                                                               A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.   \n",
+       "814                                                                                                                                                                                                                                                                                                                         Overlay   \n",
+       "44                                                                                                                                                                                                                                                                                                                             None   \n",
+       "1041                                                                                                                                                                                                                                                                                                                           None   \n",
+       "134                                                               Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton.  This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.   \n",
+       "2009                                                                                                                                                                                                                                         A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.   \n",
+       "12                                                                                                                                                                                                                                                                                  I-580 Interchange Improvements In Castro Valley   \n",
+       "2107                                                                                                                                                                                                                                      A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.   \n",
+       "1371                                                                                                                                                                                                                                        A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.   \n",
+       "973   Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)   \n",
+       "1823                                                                                                                                                                                                                                                                                                                  Over Crossing   \n",
+       "97                                                                                                                                                                                                  Double Track Between Cp Canyon (Newhall Siding)  And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.   \n",
+       "\n",
+       "                                    categories  \\\n",
+       "2587                            infrastructure   \n",
+       "1058                                             \n",
+       "17                              infrastructure   \n",
+       "845                             infrastructure   \n",
+       "1933                    street  infrastructure   \n",
+       "2032                            infrastructure   \n",
+       "331                                              \n",
+       "3222                   bridge   infrastructure   \n",
+       "106                                    transit   \n",
+       "1092                                             \n",
+       "1703             street freeway infrastructure   \n",
+       "753                             infrastructure   \n",
+       "43                   transit    infrastructure   \n",
+       "627                             infrastructure   \n",
+       "900                                              \n",
+       "2016                    street  infrastructure   \n",
+       "1759                            infrastructure   \n",
+       "2082                    street  infrastructure   \n",
+       "1488                 transit    infrastructure   \n",
+       "1886                            infrastructure   \n",
+       "311                             infrastructure   \n",
+       "1335             bridge street  infrastructure   \n",
+       "487                  transit    infrastructure   \n",
+       "471                                              \n",
+       "901                      active transportation   \n",
+       "456                                              \n",
+       "1140                                             \n",
+       "3275           street  infrastructure   safety   \n",
+       "3176            active transportation   street   \n",
+       "1540  active transportation     infrastructure   \n",
+       "1318                      passenger mode shift   \n",
+       "3135                                             \n",
+       "320                             infrastructure   \n",
+       "1633             street freeway infrastructure   \n",
+       "1376                                             \n",
+       "3036                    street  infrastructure   \n",
+       "826                             infrastructure   \n",
+       "340                                     street   \n",
+       "2405    street freeway infrastructure   safety   \n",
+       "814                             infrastructure   \n",
+       "44                                               \n",
+       "1041                                             \n",
+       "134                  transit    infrastructure   \n",
+       "2009                   bridge   infrastructure   \n",
+       "12                              infrastructure   \n",
+       "2107                   infrastructure   safety   \n",
+       "1371                   bridge   infrastructure   \n",
+       "973                  transit    infrastructure   \n",
+       "1823                            infrastructure   \n",
+       "97                              infrastructure   \n",
+       "\n",
+       "                                  applicable_metrics  \n",
+       "2587                                          safety  \n",
+       "1058                                                  \n",
+       "17                                            safety  \n",
+       "845                                           safety  \n",
+       "1933                                          safety  \n",
+       "2032                                          safety  \n",
+       "331                                                   \n",
+       "3222                                          safety  \n",
+       "106                               infill_development  \n",
+       "1092                                                  \n",
+       "1703                                          safety  \n",
+       "753                                           safety  \n",
+       "43                        safety  infill_development  \n",
+       "627                                           safety  \n",
+       "900                                                   \n",
+       "2016                                          safety  \n",
+       "1759                                          safety  \n",
+       "2082                                          safety  \n",
+       "1488                      safety  infill_development  \n",
+       "1886                                          safety  \n",
+       "311                                           safety  \n",
+       "1335                                          safety  \n",
+       "487                       safety  infill_development  \n",
+       "471                                                   \n",
+       "901          passenger_mode_shift infill_development  \n",
+       "456                                                   \n",
+       "1140                                                  \n",
+       "3275                                          safety  \n",
+       "3176         passenger_mode_shift infill_development  \n",
+       "1540  safety passenger_mode_shift infill_development  \n",
+       "1318                                                  \n",
+       "3135                                                  \n",
+       "320                                           safety  \n",
+       "1633                                          safety  \n",
+       "1376                                                  \n",
+       "3036                                          safety  \n",
+       "826                                           safety  \n",
+       "340                                                   \n",
+       "2405                                          safety  \n",
+       "814                                           safety  \n",
+       "44                                                    \n",
+       "1041                                                  \n",
+       "134                       safety  infill_development  \n",
+       "2009                                          safety  \n",
+       "12                                            safety  \n",
+       "2107                                          safety  \n",
+       "1371                                          safety  \n",
+       "973                       safety  infill_development  \n",
+       "1823                                          safety  \n",
+       "97                                            safety  "
+      ]
+     },
+     "execution_count": 90,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects_metric[['grant_program','project_description2','categories','applicable_metrics']].sample(50)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 86,
+   "id": "9ea21916-dd50-4396-850b-87ea2535c9f4",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "7"
+      ]
+     },
+     "execution_count": 86,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "all_projects_metric.applicable_metrics.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1ad5b6ae-9407-46ae-b2ff-c9ad6cbea83c",
+   "metadata": {},
+   "source": [
+    "### Categorization"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "f9a86d28-9b77-48c3-ba6d-27dc360f2fd0",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def get_list_of_words(df, col: str) -> list:\n",
+    "    \"\"\"\n",
+    "    Natalie's function to clean and place words in a project description column\n",
+    "    into a list\n",
+    "    \"\"\"\n",
+    "    # get just the one col\n",
+    "    column = df[[col]]\n",
+    "\n",
+    "    # remove single-dimensional entries from the shape of an array\n",
+    "    col_text = column.squeeze()\n",
+    "    # convert the column's values to a list (one entry per description)\n",
+    "    text_list = col_text.tolist()\n",
+    "\n",
+    "    # Join all the column into one large text blob, lower text\n",
+    "    text_list = \" \".join(text_list).lower()\n",
+    "\n",
+    "    # remove punctuation\n",
+    "    text_list = re.sub(r\"[^\\w\\s]\", \"\", text_list)\n",
+    "\n",
+    "    # List of stopwords\n",
+    "    swords = [re.sub(r\"[^A-z\\s]\", \"\", sword) for sword in stopwords.words(\"english\")]\n",
+    "\n",
+    "    # Remove stopwords\n",
+    "    clean_text_list = [\n",
+    "        word for word in word_tokenize(text_list.lower()) if word not in swords\n",
+    "    ]\n",
+    "\n",
+    "    return clean_text_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "cd602787-2444-49c5-8bb8-c59a63975de5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def find_common_phrases(df, description_column: str, values_to_add: list):\n",
+    "\n",
+    "    # Break apart every word in the description column into a list\n",
+    "    descriptions_list = get_list_of_words(df, description_column)\n",
+    "\n",
+    "    # Get phrases of whatever length you want (2,3,4,etc)\n",
+    "    c = Counter([\" \".join(y) for x in [2] for y in ngrams(descriptions_list, x)])\n",
+    "\n",
+    "    # Make a dataframe out of the counter values\n",
+    "    df_phrases = pd.DataFrame({\"phrases\": list(c.keys()), \"total\": list(c.values())})\n",
+    "\n",
+    "    # Take phrases that are repeated more than 40 times and turn them into a list\n",
+    "    df_phrases = ((df_phrases.loc[df_phrases[\"total\"] > 40])).reset_index(drop=True)\n",
+    "    common_phrases_list = df_phrases.phrases.tolist()\n",
+    "\n",
+    "    phrases_to_del = [\n",
+    "        \"san bernardino\",\n",
+    "        \"los angeles\",\n",
+    "        \"contra costa\",\n",
+    "        \"el dorado\",\n",
+    "        \"san luis obispo\",\n",
+    "        \"luis obispo\",\n",
+    "        \"del norte\",\n",
+    "        \"san francisco\",\n",
+    "        \"improve approximately\",\n",
+    "    ]\n",
+    "\n",
+    "    common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))\n",
+    "\n",
+    "    # Clean up the list to delete county information/etc\n",
+    "    words_to_delete = [\n",
+    "        \"county\",\n",
+    "        \"route\",\n",
+    "        \"dollar\",\n",
+    "        \"mile\",\n",
+    "        \"santa\",\n",
+    "        \"project\",\n",
+    "        \"san\",\n",
+    "        \"lanes\",\n",
+    "        \"lane\",\n",
+    "        \"2\",\n",
+    "        \"4\",\n",
+    "        \"financial\",\n",
+    "        \"prop\",\n",
+    "        \"best\",\n",
+    "        \"approximately\",\n",
+    "    ]\n",
+    "\n",
+    "    for word in words_to_delete:\n",
+    "        common_phrases_list = [x for x in common_phrases_list if word not in x]\n",
+    "\n",
+    "    # ADD certain keywords here\n",
+    "    # Operating Additional Service\n",
+    "    common_phrases_list.extend(values_to_add)\n",
+    "\n",
+    "    return common_phrases_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "id": "ec139873-4bb7-4428-9fd7-ceb9e247d4a3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def categorize_projects(\n",
+    "    df,\n",
+    "    description_column: str,\n",
+    "    project_id_column: str,\n",
+    "    title_column: str,\n",
+    "    values_to_add: list,\n",
+    "):\n",
+    "\n",
+    "    # Find most common 2 word phrases for some automatic project categories\n",
+    "    common_phrases_list = find_common_phrases(df, description_column, values_to_add)\n",
+    "\n",
+    "    # Place all the words in common_phrases_list into a blob named query\n",
+    "    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa\n",
+    "    query = \"|\".join(common_phrases_list)\n",
+    "\n",
+    "    # Remove punctuation and lowercase strings in the original description column before searching\n",
+    "    df[\"clean_description\"] = (\n",
+    "        df[description_column]\n",
+    "        .str.lower()\n",
+    "        .str.replace(\"-\", \" \", regex=True)\n",
+    "        .str.replace(\"(\", \" \", regex=True)\n",
+    "        .str.replace(\")\", \" \", regex=True)\n",
+    "        .str.replace(\".\", \" \", regex=True)\n",
+    "        .str.strip()\n",
+    "    )\n",
+    "\n",
+    "    # Search through description column for the most common phrases\n",
+    "    # Input the results in the new column\n",
+    "    df[\"auto_project_category\"] = df[\"clean_description\"].str.findall(\n",
+    "        r\"\\b({})\\b\".format(query)\n",
+    "    )\n",
+    "\n",
+    "    # Explode to take categories out of a list\n",
+    "    # Drop duplicate project keywords by title\n",
+    "    df = (\n",
+    "        df.explode(\"auto_project_category\")\n",
+    "        .sort_values([project_id_column, title_column])\n",
+    "        .drop_duplicates(\n",
+    "            subset=[\n",
+    "                description_column,\n",
+    "                project_id_column,\n",
+    "                title_column,\n",
+    "                \"auto_project_category\",\n",
+    "            ]\n",
+    "        )\n",
+    "    )\n",
+    "\n",
+    "    # Fill any uncategorized projects as \"Other\"\n",
+    "    df[\"auto_project_category\"] = (\n",
+    "        df[\"auto_project_category\"].fillna(\"Other\").str.title()\n",
+    "    )\n",
+    "\n",
+    "    # Correct spelling\n",
+    "    spell = Speller(lang=\"en\")\n",
+    "    df[\"auto_project_category\"] = df[\"auto_project_category\"].apply(\n",
+    "        lambda x: \" \".join([spell(i) for i in x.split()])\n",
+    "    )\n",
+    "\n",
+    "    # Summarize - put all the categories onto one line\n",
+    "    df = (\n",
+    "        df.groupby(\n",
+    "            [\n",
+    "                description_column,\n",
+    "                project_id_column,\n",
+    "                title_column,\n",
+    "            ]\n",
+    "        )[\"auto_project_category\"]\n",
+    "        .apply(\",\".join)\n",
+    "        .reset_index()\n",
+    "    )\n",
+    "\n",
+    "    return df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "d123f3b9-da23-4d4d-a2e2-dc3769100171",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def add_all_projects2():\n",
+    "\n",
+    "    # Load  dataframes\n",
+    "    state_rail_plan = harmonize_srp()\n",
+    "    lost = harominze_lost()\n",
+    "    sb1 = harmonize_sb1()\n",
+    "\n",
+    "    # Concat for df\n",
+    "    all_projects_df = pd.concat([lost, state_rail_plan, sb1])\n",
+    "\n",
+    "    # Categorize\n",
+    "    categories = categorize_projects(\n",
+    "        all_projects_df,\n",
+    "        \"project_description\",\n",
+    "        \"project_title\",\n",
+    "        \"project_id\",\n",
+    "        [\n",
+    "            \"operating\",\n",
+    "            \"service\",\n",
+    "            \"zero emission vehicle\",\n",
+    "            \"zev\",\n",
+    "            \"maintain/repair\",\n",
+    "            \"repair/replace\",\n",
+    "        ],\n",
+    "    )\n",
+    "\n",
+    "    # Merge categorized\n",
+    "    all_projects_df = pd.merge(\n",
+    "        all_projects_df.drop(columns=[\"clean_description\"]),\n",
+    "        categories,\n",
+    "        how=\"left\",\n",
+    "        on=[\"project_description\", \"project_title\", \"project_id\"],\n",
+    "    )\n",
+    "\n",
+    "    # Rename\n",
+    "    all_projects_df = all_projects_df.drop(columns=[\"auto_project_category_x\"]).rename(\n",
+    "        columns={\"auto_project_category_y\": \"auto_tagged_project_categories\"}\n",
+    "    )\n",
+    "    # Concat for gdf\n",
+    "    all_projects_gdf = pd.concat([sb1])\n",
+    "    all_projects_gdf = all_projects_gdf.set_geometry(\"location\")\n",
+    "\n",
+    "    return all_projects_df, all_projects_gdf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "87a29e05-0ba6-40cb-93e2-d097159e6235",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# all_projects, all_projects_geo = add_all_projects()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 48,
+   "id": "873a88ca-5a47-4bfe-a1d3-715a5bed05bb",
+   "metadata": {
+    "scrolled": true,
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "85cfedf8-14aa-4d6c-b30e-cc9f6ee5bbf8",
+   "metadata": {},
+   "source": [
+    "### Look at the data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "id": "a7e39b78-af8b-4bc5-8911-572839a72b36",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "ename": "KeyError",
+     "evalue": "\"Column(s) ['project_id'] do not exist\"",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mall_projects\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlead_agency\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mproject_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnunique\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39msort_values(\n\u001b[1;32m      2\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, ascending\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m      3\u001b[0m )\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m10\u001b[39m)\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/groupby/generic.py:895\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[0m\n\u001b[1;32m    892\u001b[0m func \u001b[38;5;241m=\u001b[39m maybe_mangle_lambdas(func)\n\u001b[1;32m    894\u001b[0m op \u001b[38;5;241m=\u001b[39m GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args, kwargs)\n\u001b[0;32m--> 895\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    896\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    897\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:172\u001b[0m, in \u001b[0;36mApply.agg\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    169\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_str()\n\u001b[1;32m    171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(arg):\n\u001b[0;32m--> 172\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(arg):\n\u001b[1;32m    174\u001b[0m     \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[1;32m    175\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magg_list_like()\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:496\u001b[0m, in \u001b[0;36mApply.agg_dict_like\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    493\u001b[0m     selected_obj \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selected_obj\n\u001b[1;32m    494\u001b[0m     selection \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selection\n\u001b[0;32m--> 496\u001b[0m arg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43magg\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m selected_obj\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m    499\u001b[0m     \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n\u001b[1;32m    500\u001b[0m     colg \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_gotitem(selection, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
+      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:619\u001b[0m, in \u001b[0;36mApply.normalize_dictlike_arg\u001b[0;34m(self, how, obj, func)\u001b[0m\n\u001b[1;32m    617\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m    618\u001b[0m         cols_sorted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(safe_sort(\u001b[38;5;28mlist\u001b[39m(cols)))\n\u001b[0;32m--> 619\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcols_sorted\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m do not exist\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    621\u001b[0m aggregator_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m    623\u001b[0m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[1;32m    624\u001b[0m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[1;32m    625\u001b[0m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[1;32m    626\u001b[0m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
+      "\u001b[0;31mKeyError\u001b[0m: \"Column(s) ['project_id'] do not exist\""
+     ]
+    }
+   ],
+   "source": [
+    "all_projects.groupby([\"lead_agency\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
+    "    \"project_id\", ascending=False\n",
+    ").head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "daa0b1d3-4416-4537-b568-bdaae9fd1fdb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects[\n",
+    "    (all_projects.county == \"Kern\")\n",
+    "    & (all_projects.project_description.str.contains(\"Seal Coat\"))\n",
+    "].drop(columns=[\"location\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc906308-31d4-4fde-b492-8218b05cec90",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d2d6ac3a-c517-4df2-b907-0bac0a09e34a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"auto_tagged_project_categories\"]).agg(\n",
+    "    {\"project_id\": \"nunique\"}\n",
+    ").sort_values(\"project_id\", ascending=False).head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5150da00-2a30-4f4d-bec8-1d9e5c66d623",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"project_category\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
+    "    \"project_id\", ascending=False\n",
+    ").head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1bf38631-a734-47b0-9465-fcfb8ebafcad",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"project_description\"]).agg(\n",
+    "    {\"project_id\": \"nunique\"}\n",
+    ").sort_values(\"project_id\", ascending=False).head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c1baa16-e15c-48e7-9772-ef67755f9d21",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"county\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
+    "    \"project_id\", ascending=False\n",
+    ").head(10)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "9d55e4ed-9b69-4111-b2ed-69715c9d90c5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.lead_agency.nunique()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "01a534d9-75e4-4ff8-aa11-99db480de733",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.total_project_cost.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6985e5d0-cf27-423f-8775-16eb3c518beb",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "all_projects.loc[all_projects.fully_funded == \"Fully funded\"].groupby(\n",
+    "    [\"data_source\"]\n",
+    ").agg({\"project_id\": \"nunique\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3259fc95-2db6-46ad-8cc6-a0357aa19077",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.loc[all_projects.fully_funded == \"Partially funded\"].groupby(\n",
+    "    [\"data_source\"]\n",
+    ").agg({\"project_id\": \"nunique\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "2ef08825-9e29-4268-9172-d0d83e08243b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"data_source\"]).agg({\"project_id\": \"nunique\"})"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5fae701e-4132-4d06-8c27-3e598e072172",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"fully_funded\"]).agg(\n",
+    "    {\"project_id\": \"nunique\"}\n",
+    ").reset_index().sort_values(\"project_id\", ascending=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "171611d6-acf9-46d8-9814-20534114d43e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_projects.groupby([\"data_source\", \"fully_funded\"]).agg({\"project_id\": \"nunique\"})"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/project_list/compile_all_projects.ipynb b/project_list/compile_all_projects.ipynb
index 1863ec094..7a7079c4c 100644
--- a/project_list/compile_all_projects.ipynb
+++ b/project_list/compile_all_projects.ipynb
@@ -2,117 +2,40 @@
  "cells": [
   {
    "cell_type": "markdown",
-   "id": "77106c12-82aa-4be4-8d9c-e66fafec4d67",
-   "metadata": {
-    "tags": []
-   },
+   "id": "a47ae6dd-278c-42d0-a708-f72088e55f51",
+   "metadata": {},
    "source": [
-    "## General function to clean up data from various grants\n",
-    "To-Do\n",
-    "* De duplicate projects\n",
-    "* Rearrange counties in County column in alphabetical order.\n",
-    "* Millions to thousands -> seems easier to read.\n",
-    "* Differentiate btwn project START year and END year.\n",
-    "* Add Post Mile column\n",
-    "\n",
-    "Done\n",
-    "* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n",
-    "\n",
-    "Strategy/Questions:\n",
-    "* Make sure one row=one project. How? \n",
-    "* What should be the unit of project cost?\n",
-    "* Break up Caltrans by district or leave as is? \n",
-    "\n",
-    "Columns/Data Dictionary\n",
-    "* project_title (str): N/A.\n",
-    "* lead_agency (str): the entity leading the project or receiving the grant.\n",
-    "* project_year (TBD): when the project will begin.\n",
-    "* project_category (str): the category/categories a project belongs to.\n",
-    "* grant_program (str): the fund a project is receiving funds for. This does not preclude the fact that a project can receive funds from mulitple programs. \n",
-    "* phase (str): the latest phase the project is in.\n",
-    "* project_description (str): N/A.\n",
-    "* total_project_cost_(millions): N/A.\n",
-    "* total_available_funds_(millions): all the funds available to the project.\n",
-    "* unfunded_needs_(millions): subtract total_project_cost_(millions) by total_available_funds_(millionis).\n",
-    "* city (str): the city a project is located in.\n",
-    "* county (str): the county a project is lcoated in.\n",
-    "* location (str): an address or more detailed information regarding where the project will take place.\n",
-    "* geometry: geospatial information.\n",
-    "* data_source (str): N/A.\n",
-    "* notes (str): additional information regarding the project.\n",
-    "* funding_notes (str): additional funding information regarding the project.\n",
-    "* ct_district (int): the Caltrans district a project is located in.\n",
-    "* fully_funded (str): comparing total_available_funds_(millions) and total_project_cost_(millions) to figure out whether a project is fully, partially, or not funded.\n",
-    "* enough_info (str): counting the # of null values and # of strings in the project description to determine whether or not a project has enough information."
+    "## Compile Projects\n",
+    "To-do\n",
+    "* Figure out how to version the data, because projects will get updated and we want to track any changes.\n",
+    "* Versioning only needs to be done for data from lp2000 and ctips.\n",
+    "* Need to track changes across all the different dataframes.\n",
+    "* Use merges to figure it out?"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "06ac19fe-7b6c-4560-9740-8a4f72c5b6e1",
+   "id": "bd086d78-ffe5-4cf3-9f70-f5b7f3a5cf40",
    "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n",
-      "  warnings.warn(\n",
-      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n",
-      "\n",
-      "import os\n",
-      "os.environ['USE_PYGEOS'] = '0'\n",
-      "import geopandas\n",
-      "\n",
-      "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n",
-      "  import geopandas as gpd\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "import _cleaning_utils\n",
-    "import _harmonization_utils as harmonization_utils\n",
+    "import _harmonization_utils as har_utils\n",
+    "import _lrtp_utils as lrtp_utils\n",
+    "import _sb1_utils as sb1_utils\n",
+    "import _specific_list_utils\n",
     "import _state_rail_plan_utils as srp_utils\n",
-    "# import geopandas as gpd\n",
+    "import geopandas as gpd\n",
     "import pandas as pd\n",
-    "# import shapely\n",
-    "from calitp_data_analysis.sql import to_snakecase"
+    "from calitp_data_analysis.sql import to_snakecase\n",
+    "import hashlib\n",
+    "from datetime import datetime"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "d78be4e7-2349-4ffd-9d59-f9fa450ae7dd",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "'\\nimport re\\nimport nltk\\nfrom nltk import ngrams\\nfrom nltk.corpus import stopwords\\nfrom nltk.tokenize import sent_tokenize, word_tokenize\\nimport re\\nfrom collections import Counter\\nfrom autocorrect import Speller\\n'"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "\"\"\"\n",
-    "import re\n",
-    "import nltk\n",
-    "from nltk import ngrams\n",
-    "from nltk.corpus import stopwords\n",
-    "from nltk.tokenize import sent_tokenize, word_tokenize\n",
-    "import re\n",
-    "from collections import Counter\n",
-    "from autocorrect import Speller\n",
-    "\"\"\""
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "id": "e7b68eeb-422d-4be8-b557-7bd9e95599af",
+   "id": "7a1e769e-8af3-4bb8-87c9-e3b1c64c644b",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -124,629 +47,570 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "0541b671-a020-485f-9b0a-f46238f1d4f9",
+   "execution_count": 3,
+   "id": "9f74e0a0-c1e5-4e42-8d06-11e0ef0464ab",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# lost = harmonization_utils.load_lost()"
+    "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/project_list/\""
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": 5,
-   "id": "db720477-44f5-4cbd-80ac-a0fe86e47cf9",
+   "cell_type": "markdown",
+   "id": "9dc3a31d-a797-4ea0-ad80-a4adb5b4c740",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "def create_notes(df, note_cols: list, new_col_name: str):\n",
-    "    \"\"\"\n",
-    "    Concat multiple columns into one.\n",
-    "    \"\"\"\n",
-    "    prefix = \"_\"\n",
-    "    for column in note_cols:\n",
-    "        df[f\"{prefix}{column}\"] = df[column].astype(str)\n",
-    "    note_cols = [prefix + sub for sub in note_cols]\n",
-    "\n",
-    "    # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values\n",
-    "    def combine_notes(x):\n",
-    "        return \", \".join([col + \": \" + x[col] for col in note_cols])\n",
-    "\n",
-    "    df[new_col_name] = df.apply(combine_notes, axis=1)\n",
-    "    df[new_col_name] = df[new_col_name].str.replace(\"_\", \" \")\n",
-    "\n",
-    "    return df"
+    "### LRTP/LOST"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "78b5d13c-f4ba-4baf-8c3a-f520a960a44a",
+   "execution_count": 19,
+   "id": "67204871-470c-41ec-bf61-6b82b90c88e4",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "96 rows are headers\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:720: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "  sandag.cost2020m.str.replace(\"$\", \"\")\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "65 rows are headers\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "360 rows are headers\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "68 rows are headers\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
+      "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
+     ]
+    }
+   ],
    "source": [
-    "# srp = harmonization_utils.load_state_rail_plan()"
+    "lrtp_lost_df, lrtp_lost_gdf = lrtp_utils.all_mpo(True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "f3829bd6-8fc1-4c15-809f-75020248a722",
+   "execution_count": 21,
+   "id": "052c4887-4ca7-4ae9-bb15-cdbc26544298",
    "metadata": {},
    "outputs": [],
    "source": [
-    "columns_to_keep = [\n",
-    "    \"project_title\",\n",
-    "    \"lead_agency\",\n",
-    "    \"project_year\",\n",
-    "    \"project_category\",\n",
-    "    \"project_start_year\",\n",
-    "    \"project_completion_year\",\n",
-    "    \"grant_program\",\n",
-    "    \"phase\",\n",
-    "    \"project_description\",\n",
-    "    \"total_project_cost_(millions)\",\n",
-    "    \"total_available_funds_(millions)\",\n",
-    "    \"unfunded_needs_(millions)\",\n",
-    "    \"city\",\n",
-    "    \"county\",\n",
-    "    \"location\",\n",
-    "    \"post_mile\",\n",
-    "    \"geometry\",\n",
-    "    \"data_source\",\n",
-    "    \"notes\",\n",
-    "    \"funding_notes\",\n",
-    "    \"ct_district\",\n",
-    "    \"project_description2\",\n",
-    "]"
+    "def unique_project_number(df:pd.DataFrame) -> pd.DataFrame:\n",
+    "    df['timestamp'] = datetime.now().strftime('%Y%m%d%H%M%S')\n",
+    "    df['combo'] = df.apply(lambda row: f\"{row.name}{row['timestamp']}{''.join(map(str, row))}\", axis=1)\n",
+    "    df['project_number'] = df['combo'].apply(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest()[:12])\n",
+    "    df = df.drop(columns = ['combo', 'timestamp'])\n",
+    "    return df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "291e821d-9b3f-40a2-bde9-7a12b31eb410",
+   "execution_count": 22,
+   "id": "4fb60368-eb54-4c9e-a422-c42f45ad74b0",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def harmonizing(\n",
-    "    df,\n",
-    "    agency_name_col: str,\n",
-    "    project_name_col: str,\n",
-    "    project_description_col: str,\n",
-    "    project_category_col: str,\n",
-    "    phase_col: str,\n",
-    "    project_cost_col: str,\n",
-    "    location_col: str,\n",
-    "    geography_col: str,\n",
-    "    post_mile_col:str,\n",
-    "    county_col: str,\n",
-    "    city_col: str,\n",
-    "    district_col:str, \n",
-    "    project_start_year_col: str,\n",
-    "    project_completion_year_col:str,\n",
-    "    program_col: str,\n",
-    "    data_source: str,\n",
-    "    fund_cols: list,\n",
-    "    notes_cols: list,\n",
-    "    cost_in_millions: bool = True,\n",
-    "):\n",
-    "    \"\"\"\n",
-    "    Take a dataset and change the column names/types to\n",
-    "    default names and formats.\n",
-    "    \"\"\"\n",
-    "    rename_columns = {\n",
-    "        agency_name_col: \"lead_agency\",\n",
-    "        project_name_col: \"project_title\",\n",
-    "        project_description_col: \"project_description\",\n",
-    "        project_category_col: \"project_category\",\n",
-    "        project_cost_col: \"total_project_cost_(millions)\",\n",
-    "        location_col: \"location\",\n",
-    "        geography_col: \"geometry\",\n",
-    "        phase_col: \"phase\",\n",
-    "        post_mile_col: \"post_mile\",\n",
-    "        county_col: \"county\",\n",
-    "        city_col: \"city\",\n",
-    "        district_col: \"ct_district\",\n",
-    "        project_start_year_col: \"project_start_year\",\n",
-    "        project_end_year_col: \"project_completion_year\",\n",
-    "        program_col: \"grant_program\",\n",
-    "    }\n",
-    "    # Rename columns\n",
-    "    df = df.rename(columns=rename_columns)\n",
-    "    \n",
-    "    # Clean up monetary columns to be interger\n",
-    "    cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n",
-    "    for i in cost_columns:\n",
-    "        df[i] = df[i].apply(pd.to_numeric, errors=\"coerce\").fillna(0)\n",
-    "    \n",
-    "    # Divide cost columns by millions\n",
-    "    # If bool is set to True\n",
-    "    if cost_in_millions:\n",
-    "        for i in fund_cols + [\"total_project_cost_(millions)\"]:\n",
-    "            df[i] = df[i].divide(1_000_000)\n",
-    "\n",
-    "    # Add new column with funding breakout\n",
-    "    # Since it's summarized above and the details are suppressed.\n",
-    "    df[\"total_available_funds_(millions)\"] = df[fund_cols].sum(axis=1)\n",
-    "    df = create_notes(df, fund_cols, \"funding_notes\")\n",
-    "    \n",
-    "    # Add column for unfunded needs\n",
-    "    df[\"unfunded_needs_(millions)\"] = df[\"total_project_cost_(millions)\"] - df[\"total_available_funds_(millions)\"]\n",
-    "    \n",
-    "    # Add program\n",
-    "    df[\"data_source\"] = data_source\n",
-    "    \n",
-    "    # Create columns even if they don't exist, just to harmonize\n",
-    "    # before concatting.\n",
-    "    create_columns = [\n",
-    "        \"county\",\n",
-    "        \"city\",\n",
-    "        \"notes\",\n",
-    "        \"project_start_year\",\n",
-    "        \"project_completion_year\",\n",
-    "        \"post_mile\",\n",
-    "        \"project_category\",\n",
-    "        \"location\",\n",
-    "        \"phase\",\n",
-    "        \"ct_district\"\n",
-    "    ]\n",
-    "    for column in create_columns:\n",
-    "        if column not in df:\n",
-    "            df[column] = \"None\"\n",
-    "    if \"geometry\" not in df:\n",
-    "        df[\"geometry\"] = None\n",
-    "    if \"grant_program\" not in df:\n",
-    "        df[\"grant_program\"] = data_source\n",
-    "    \n",
-    "    # Create notes - aka other columns that were supressed\n",
-    "    df = create_notes(df, notes_cols, \"notes\")\n",
-    "    \n",
-    "    # Clean up string columns\n",
-    "    string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n",
-    "    for i in string_cols:\n",
-    "        df[i] = df[i].str.replace(\"_\", \" \").str.strip().str.title()\n",
-    "\n",
-    "    # Fill in any nulls\n",
-    "    df['project_description2'] = df.project_description.fillna(df.project_title)\n",
-    "    df = df.fillna(df.dtypes.replace({\"float64\": 0.0, \"object\": \"None\"}))\n",
-    "\n",
-    "    # Only keep certain columns\n",
-    "    df = df[columns_to_keep]\n",
-    "    return df"
+    "lrtp_lost_df = unique_project_number(lrtp_lost_df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "0ea5badb-841b-4941-b48f-23d750b5ed27",
+   "execution_count": 23,
+   "id": "846f7621-4999-4a7d-b2f7-ec0a18b19ac5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "16276"
+      ]
+     },
+     "execution_count": 23,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "def harmonize_srp():\n",
-    "    df = harmonization_utils.load_state_rail_plan()\n",
-    "    df = harmonizing(\n",
-    "        df,\n",
-    "        agency_name_col=\"lead_agency\",\n",
-    "        project_name_col=\"project_name\",\n",
-    "        project_description_col=\"project_description\",\n",
-    "        project_category_col=\"project_category\",\n",
-    "        phase_col=\"\",\n",
-    "        project_cost_col=\"total_project_cost\",\n",
-    "        location_col=\"corridor\",\n",
-    "        geography_col=\"\",\n",
-    "        county_col=\"\",\n",
-    "        city_col=\"\",\n",
-    "        district_col=\"\",\n",
-    "        project_year_col=\"\",\n",
-    "        program_col=\"\",\n",
-    "        data_source=\"State Rail Plan\",\n",
-    "        fund_cols=[],\n",
-    "        notes_cols = ['project_time_horizon','srp_region', \n",
-    "       'sub_corridor_node_1', 'sub_corridor_node_2', 'itsp_corridor'],\n",
-    "        cost_in_millions=True,\n",
-    "    )\n",
-    "\n",
-    "    return df"
+    "lrtp_lost_df.project_number.nunique()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
-   "id": "2b60a4e7-cc69-41fb-9285-c32f9fa0791e",
+   "execution_count": 55,
+   "id": "d5cd997b-dc72-4f04-9950-158fe13c25fb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# srp_harmonized = harmonize_srp()"
+    "def separate_out_df(df:pd.DataFrame, columns_to_keep: list)-> pd.DataFrame:\n",
+    "    \"\"\"\n",
+    "    Subset the dataframe to the given columns, then drop any rows\n",
+    "    in which the values are NaN or \"None\".\n",
+    "    \"\"\"\n",
+    "    # Subset\n",
+    "    df2 = df[columns_to_keep]\n",
+    "    \n",
+    "    # Fill in missing values\n",
+    "    try:\n",
+    "        df2 = df2.fillna('none')\n",
+    "    except:\n",
+    "        df2\n",
+    "    \n",
+    "    # Remove project_number and keep only cols of interest\n",
+    "    columns_to_keep.remove('project_number')\n",
+    "    \n",
+    "    # Drop rows that are nan or \"None\" based on how many columns are listed\n",
+    "    if len(columns_to_keep) == 1:\n",
+    "        df2 = df2.dropna(how = \"any\")\n",
+    "        df2 = df2[df2.applymap(lambda x: x.lower() if isinstance(x, str) else x) != 'none'].dropna()\n",
+    "        \n",
+    "    # If there is more than one column to separate out,\n",
+    "    # keep any row that has at least one non-null value\n",
+    "    else:\n",
+    "        df2 = df2.dropna(how = \"all\", subset = columns_to_keep)\n",
+    "    return df2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
-   "id": "2100f3b7-55c8-45ad-b3d1-99a0319c7ac8",
+   "execution_count": 25,
+   "id": "8acf0ccd-3b35-41db-9320-3352b9b4e813",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# srp_harmonized.tail()"
+    "lrtp_lost_county = separate_out_df(lrtp_lost_df, ['project_number', 'county'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
-   "id": "3142aacb-d5f4-4bc1-8cc8-99f50c45b301",
+   "execution_count": 26,
+   "id": "c75b08c0-4da0-44d0-b675-dc848240b994",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(16276, 11)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# srp_og = harmonization_utils.load_state_rail_plan()"
+    "lrtp_lost_df.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
-   "id": "0ae0b8bd-3e5b-4119-8fee-d496689f9c7c",
+   "execution_count": 27,
+   "id": "fce5ffc2-1b46-4f8d-bbb1-2a9251d35a8c",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4012, 2)"
+      ]
+     },
+     "execution_count": 27,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# srp_og.sample()"
+    "lrtp_lost_county.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
-   "id": "9307a340-c699-4d93-ba30-abe04563dd8d",
+   "execution_count": 28,
+   "id": "71ff78d9-cafc-4bc4-bc68-2f0e001a5acd",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# srp_og.columns"
+    "lrtp_lost_city = separate_out_df(lrtp_lost_df, ['project_number', 'city'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
-   "id": "23804222-466a-4754-a1ad-fd8f3f8a5239",
+   "execution_count": 29,
+   "id": "4116f424-755e-4382-98c7-c3f0c65c5514",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(745, 2)"
+      ]
+     },
+     "execution_count": 29,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "def harmonize_lost():\n",
-    "    df = harmonization_utils.load_lost()\n",
-    "    df = harmonizing(\n",
-    "        df,\n",
-    "        agency_name_col=\"agency\",\n",
-    "        project_name_col=\"project_title\",\n",
-    "        project_description_col=\"project_description\",\n",
-    "        project_category_col=\"project_category\",\n",
-    "        project_cost_col=\"cost__in_millions_\",\n",
-    "        phase_col=\"\",\n",
-    "        location_col=\"location\",\n",
-    "        geography_col=\"\",\n",
-    "        county_col=\"county\",\n",
-    "        city_col=\"city\",\n",
-    "        district_col = \"\",\n",
-    "        project_year_col=\"\",\n",
-    "        program_col=\"measure\",\n",
-    "        data_source=\"Local Options Sales Tax\",\n",
-    "        fund_cols=[\n",
-    "            \"estimated_lost_funds\",\n",
-    "            \"estimated_federal_funds\",\n",
-    "            \"estimated_state_funds\",\n",
-    "            \"estimated_local_funds\",\n",
-    "            \"estimated_other_funds\",\n",
-    "        ],\n",
-    "        notes_cols = [\"notes\"],\n",
-    "        cost_in_millions=False,\n",
-    "    )\n",
-    "\n",
-    "    return df"
+    "lrtp_lost_city.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
-   "id": "e13f87d5-514f-404f-8cc8-4dbf877754da",
+   "execution_count": 30,
+   "id": "829f6817-f0f7-4193-9ce8-f9732baff8d8",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# lost_og = harmonization_utils.load_lost()"
+    "lrtp_lost_agency = separate_out_df(lrtp_lost_df, ['project_number', 'lead_agency'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
-   "id": "60d66f94-7915-43b7-990e-896600e20d40",
+   "execution_count": 31,
+   "id": "78df708c-82cf-44ae-99f4-dd84e7c40dc1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(16276, 2)"
+      ]
+     },
+     "execution_count": 31,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# lost_og.columns"
+    "lrtp_lost_agency.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
-   "id": "20e8a81a-e6b1-4bdf-a0f8-21420c62b68a",
+   "execution_count": 32,
+   "id": "dd61d79c-9999-4c11-9353-47ac4f16b8b1",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_number</th>\n",
+       "      <th>lead_agency</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>61</th>\n",
+       "      <td>8d1631fce5bc</td>\n",
+       "      <td>Slocog</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   project_number lead_agency\n",
+       "61   8d1631fce5bc      Slocog"
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "def harmonize_sb1():\n",
-    "    df = harmonization_utils.load_sb1()\n",
-    "    df = harmonizing(\n",
-    "        df,\n",
-    "        agency_name_col=\"implementingagency\",\n",
-    "        project_name_col=\"projecttitle_x\",\n",
-    "        project_description_col=\"projectdescription\",\n",
-    "        project_category_col=\"\",\n",
-    "        phase_col=\"projectstatuses\",\n",
-    "        project_cost_col=\"totalcost\",\n",
-    "        location_col=\"\",\n",
-    "        geography_col=\"geometry\",\n",
-    "        county_col=\"countynames\",\n",
-    "        city_col=\"citynames\",\n",
-    "        district_col = \"ct_districts\",\n",
-    "        project_year_col=\"fiscalyears\",\n",
-    "        program_col=\"programcodes\",\n",
-    "        data_source=\"SB1\",\n",
-    "        fund_cols=[\"sb1funds\", \"iijafunds\"],\n",
-    "        notes_cols = ['iijaprogram','dateupdated','isonshs', 'isonshscodes','agencies', 'popup'],\n",
-    "        cost_in_millions=True,\n",
-    "    )\n",
-    "\n",
-    "    return df"
+    "lrtp_lost_agency.sample()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
-   "id": "4d39d086-ef36-4f21-ab44-17980304be74",
+   "execution_count": 33,
+   "id": "5c3dfa8a-480a-4062-b59c-0ce5dff75cdc",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# sb1_og = harmonization_utils.load_sb1()"
+    "lrtp_lost_geo = separate_out_df(lrtp_lost_df, ['project_number', 'geometry'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
-   "id": "9c65170e-17ef-42da-b161-358e40f815a8",
+   "execution_count": 34,
+   "id": "40998366-218c-4c9b-b890-dc97769e3893",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((1357, 2), (1355, 10))"
+      ]
+     },
+     "execution_count": 34,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# sb1_og.columns"
+    "lrtp_lost_geo.shape, lrtp_lost_gdf.shape"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
-   "id": "5ade46ae-4768-4855-b0ea-9ff4ec7607af",
+   "execution_count": 35,
+   "id": "92fe5891-014e-4177-beb6-1e83c957d0fb",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# sb1_og.drop(columns = ['geometry']).sample(3)"
+    "lrtp_to_drop = ['county', 'city', 'lead_agency', 'geometry']"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
-   "id": "5cab53c4-9c63-4bd4-b837-f43e62900e8d",
+   "execution_count": 36,
+   "id": "29ca0b11-6244-455b-96b2-dd53144d7c0e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# harmonized_sb1 = harmonize_sb1()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6c14df71-56af-43a1-b0e0-2d02ef38e18e",
-   "metadata": {},
-   "source": [
-    "### Stacking"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "53a8e2a2-9d49-4e55-a2ee-bd6224d7fb61",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "#### Does this project have enough information to be useful?"
+    "# lrtp_lost_df = lrtp_lost_df.drop(columns = lrtp_to_drop)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
-   "id": "3926aa77-991b-48be-b57d-04077a0a485b",
+   "execution_count": 37,
+   "id": "3a475324-4f2d-41a3-8b1f-ac4f0eaf0962",
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "def categorize_info(df): \n",
-    " \n",
-    "    #Get percentiles in objects for total vehicle.\n",
-    "    p50_project_desc= df.project_description_count.quantile(0.50).astype(float)\n",
-    "    p50_null_values = df.total_percent_null_values.quantile(0.50).astype(float)\n",
-    "    \n",
-    "    #Function for fleet size\n",
-    "    def percentile_info (row):\n",
-    "        if ((row.project_description_count >= p50_project_desc) and (row.total_percent_null_values <= p50_null_values)):\n",
-    "            return \"Yes\"\n",
-    "        else: \n",
-    "            return \"No\"\n",
-    "    df[\"enough_info\"] = df.apply(lambda x: percentile_info(x), axis=1)\n",
-    "  \n",
-    "    return df    "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "id": "48854cb1-3fa8-4d4e-8e8f-7218fc8b9c7e",
-   "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>lead_agency</th>\n",
+       "      <th>project_year</th>\n",
+       "      <th>project_description</th>\n",
+       "      <th>total_project_cost</th>\n",
+       "      <th>geometry</th>\n",
+       "      <th>city</th>\n",
+       "      <th>county</th>\n",
+       "      <th>data_source</th>\n",
+       "      <th>notes</th>\n",
+       "      <th>project_number</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>416</th>\n",
+       "      <td>None</td>\n",
+       "      <td>Stancog</td>\n",
+       "      <td>2022-2046</td>\n",
+       "      <td>Bicycle Lane (Class 2), Buffered Bicycle Lane (Class 2), Bicycle Route With Wide Shoulders (Class 3.5), Separated Bike Lane (Class 4), And Pedestrian Improvements.\\n(Non-Motorized Transportation Plan Top 25: Route 25)</td>\n",
+       "      <td>8027400.00</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Stancog Lrtp</td>\n",
+       "      <td>Jurisdiction: Stanislaus County,  Location: Rhode Rd/7Th St/Nunes Rd/N. Golden State Blvd,  Project Limits: Moore Rd To W. Christofferson Pkwy,  Funding Source: Atp, Sb 1, Bil/Iija, Cmaq, Stbgp,  System Preserv : Nan,  Capacity Enhance : Nan,  Safety: Nan,  Oper : Nan,  Complete Streets: Nan,  Active\\nTransporta Tion: X,  Transit: Nan,  Other: Nan</td>\n",
+       "      <td>a32f75c83b70</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "    project_title lead_agency project_year  \\\n",
+       "416          None     Stancog    2022-2046   \n",
+       "\n",
+       "                                                                                                                                                                                                           project_description  \\\n",
+       "416  Bicycle Lane (Class 2), Buffered Bicycle Lane (Class 2), Bicycle Route With Wide Shoulders (Class 3.5), Separated Bike Lane (Class 4), And Pedestrian Improvements.\\n(Non-Motorized Transportation Plan Top 25: Route 25)   \n",
+       "\n",
+       "     total_project_cost geometry  city county   data_source  \\\n",
+       "416          8027400.00     None  None   None  Stancog Lrtp   \n",
+       "\n",
+       "                                                                                                                                                                                                                                                                                                                                                             notes  \\\n",
+       "416  Jurisdiction: Stanislaus County,  Location: Rhode Rd/7Th St/Nunes Rd/N. Golden State Blvd,  Project Limits: Moore Rd To W. Christofferson Pkwy,  Funding Source: Atp, Sb 1, Bil/Iija, Cmaq, Stbgp,  System Preserv : Nan,  Capacity Enhance : Nan,  Safety: Nan,  Oper : Nan,  Complete Streets: Nan,  Active\\nTransporta Tion: X,  Transit: Nan,  Other: Nan   \n",
+       "\n",
+       "    project_number  \n",
+       "416   a32f75c83b70  "
+      ]
+     },
+     "execution_count": 37,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "def enough_info(df):\n",
-    "    # Select string columns\n",
-    "    string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n",
-    "    \n",
-    "    # https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas\n",
-    "    # Count \"nones\" in string columns\n",
-    "    df['none_counts'] = df[string_cols].astype(str).sum(axis=1).str.lower().str.count(\"none\")\n",
-    "    \n",
-    "    # Count zeroes\n",
-    "    df['zero_counts'] = (df == 0).astype(int).sum(axis=1)\n",
-    "    \n",
-    "    # Total up all none/zeroes \n",
-    "    df[\"total_percent_null_values\"] = df[['none_counts','zero_counts']].sum(axis=1)/len(df.columns) * 100\n",
-    "    \n",
-    "    # Count project descriptions\n",
-    "    df[\"project_description_count\"] = df[\"project_description\"].str.count('\\w+')\n",
-    "    \n",
-    "    # Categorize whether it has enough info or not\n",
-    "    df = categorize_info(df)\n",
-    "    \n",
-    "    # Compress columns to retain some info\n",
-    "    df['counts'] = 'number of strings in project desc: ' + df.project_description_count.astype(str) + ' % of null values:' + df.total_percent_null_values.astype(int).astype(str)\n",
-    "    \n",
-    "    df = df.drop(columns = ['none_counts','zero_counts','project_description_count','total_percent_null_values'])\n",
-    "    return df "
+    "lrtp_lost_df.sample()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "91c4e4b0-f28d-4956-9274-d17a3306801e",
-   "metadata": {
-    "tags": []
-   },
-   "source": [
-    "#### Correct lead agencies again"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 123,
-   "id": "f0f564ce-5551-4750-94b6-bb7c5b056949",
+   "id": "0d3280f2-67c1-418d-8161-4a577b9d3034",
    "metadata": {},
-   "outputs": [],
-   "source": [
-    "def flip_county_city(df, agency_col:str):\n",
-    "    # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n",
-    "    to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]\n",
-    "    to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)\n",
-    "    to_correct['str_len'] = to_correct[agency_col].str.split().str.len()\n",
-    "    to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)\n",
-    "    to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)\n",
-    "    to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']\n",
-    "    \n",
-    "    new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))\n",
-    "    df['agency_corrected'] = df[agency_col].map(new_names_dictionary)\n",
-    "    df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])\n",
-    "    \n",
-    "    df = df.drop(columns = [agency_col])\n",
-    "    df = df.rename(columns = {\"agency_corrected\":agency_col})\n",
-    "    \n",
-    "    return df "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 126,
-   "id": "3946f71e-f987-452b-8269-331d6cb461c7",
-   "metadata": {
-    "scrolled": true,
-    "tags": []
-   },
-   "outputs": [],
    "source": [
-    "# all_projects_metric.lead_agency.value_counts()"
+    "### SB1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
-   "id": "0bd2a79a-700b-446e-8346-5aa6fb2309f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def add_all_projects():\n",
-    "\n",
-    "    # Load  dataframes\n",
-    "    state_rail_plan = harmonize_srp()\n",
-    "    lost = harmonize_lost()\n",
-    "    sb1 = harmonize_sb1()\n",
-    "\n",
-    "    # Concat for df\n",
-    "    df = pd.concat([lost, state_rail_plan, sb1])\n",
-    "    \n",
-    "    # Clean agency names\n",
-    "    df = harmonization_utils.organization_cleaning(df, \"lead_agency\")\n",
-    "    df = flip_county_city(df, 'lead_agency')\n",
-    "    \n",
-    "    # Determine if the project completely funded or not?\n",
-    "    # Add up all available funds\n",
-    "    df[\"fully_funded\"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)\n",
-    "    \n",
-    "    # Does this project have enough info?\n",
-    "    df = enough_info(df)\n",
-    "    \n",
-    "    \n",
-    "    return df"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 26,
-   "id": "5dcafef7-30b9-4582-93c8-188ede6b8562",
+   "execution_count": 38,
+   "id": "6d530178-4f39-475b-b533-822b0b19f237",
    "metadata": {},
    "outputs": [
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
-      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
-      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
-      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:37: FutureWarning: The default value of regex will change from True to False in a future version.\n",
-      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
-      "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
-      "/home/jovyan/data-analyses/project_list/_harmonization_utils.py:34: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n"
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:23: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "  gdf[i]\n",
+      "/home/jovyan/data-analyses/project_list/_sb1_utils.py:23: FutureWarning: The default value of regex will change from True to False in a future version.\n",
+      "  gdf[i]\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "0 rows are mising geometry\n",
+      "7917 rows contain invalid geography\n"
      ]
     }
    ],
    "source": [
-    "all_projects = add_all_projects()"
+    "sb1_df = sb1_utils.load_sb1()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
-   "id": "64c6a43d-0a8c-4f7c-a3cc-df3415163bf4",
+   "execution_count": 40,
+   "id": "7ee96e11-1d68-4991-8089-5749adacb311",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['project_title', 'lead_agency', 'project_year', 'project_category',\n",
-       "       'grant_program', 'phase', 'project_description',\n",
-       "       'total_project_cost_(millions)', 'total_available_funds_(millions)',\n",
-       "       'unfunded_needs_(millions)', 'city', 'county', 'location', 'geometry',\n",
-       "       'data_source', 'notes', 'funding_notes', 'ct_district',\n",
-       "       'project_description2', 'fully_funded', 'enough_info', 'counts'],\n",
-       "      dtype='object')"
-      ]
-     },
-     "execution_count": 27,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "all_projects.columns"
+    "sb1_df = unique_project_number(sb1_df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 53,
-   "id": "78825d55-c1b0-447b-b33e-493c7165aa25",
-   "metadata": {
-    "scrolled": true,
-    "tags": []
-   },
+   "execution_count": 76,
+   "id": "931ef88d-caf9-4201-bc18-d874197be065",
+   "metadata": {},
    "outputs": [
     {
      "data": {
@@ -769,1853 +633,1019 @@
        "  <thead>\n",
        "    <tr style=\"text-align: right;\">\n",
        "      <th></th>\n",
-       "      <th>project_title</th>\n",
-       "      <th>lead_agency</th>\n",
-       "      <th>project_year</th>\n",
-       "      <th>project_category</th>\n",
-       "      <th>grant_program</th>\n",
-       "      <th>phase</th>\n",
-       "      <th>project_description</th>\n",
-       "      <th>total_project_cost_(millions)</th>\n",
-       "      <th>total_available_funds_(millions)</th>\n",
-       "      <th>unfunded_needs_(millions)</th>\n",
-       "      <th>city</th>\n",
-       "      <th>county</th>\n",
-       "      <th>location</th>\n",
-       "      <th>data_source</th>\n",
-       "      <th>notes</th>\n",
-       "      <th>funding_notes</th>\n",
-       "      <th>ct_district</th>\n",
-       "      <th>project_description2</th>\n",
-       "      <th>fully_funded</th>\n",
-       "      <th>enough_info</th>\n",
-       "      <th>counts</th>\n",
+       "      <th>projectid</th>\n",
+       "      <th>projname</th>\n",
+       "      <th>projcatcode</th>\n",
+       "      <th>projcategory</th>\n",
+       "      <th>projprogcode</th>\n",
+       "      <th>projprogram</th>\n",
+       "      <th>multiprogfunded</th>\n",
+       "      <th>projstatus</th>\n",
+       "      <th>description</th>\n",
+       "      <th>cost</th>\n",
+       "      <th>assemblydistrict</th>\n",
+       "      <th>senatedistrict</th>\n",
+       "      <th>assemblycode</th>\n",
+       "      <th>senatecode</th>\n",
+       "      <th>countyname</th>\n",
+       "      <th>cityname</th>\n",
+       "      <th>countycode</th>\n",
+       "      <th>citycode</th>\n",
+       "      <th>appagencyname</th>\n",
+       "      <th>impagencyname</th>\n",
+       "      <th>geometry</th>\n",
+       "      <th>totalcosts</th>\n",
+       "      <th>routes</th>\n",
+       "      <th>constyear</th>\n",
+       "      <th>costfull</th>\n",
+       "      <th>projagency</th>\n",
+       "      <th>project_number</th>\n",
+       "      <th>assembly_same</th>\n",
+       "      <th>senate_same</th>\n",
        "    </tr>\n",
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>358</th>\n",
-       "      <td>None</td>\n",
+       "      <th>3449</th>\n",
+       "      <td>LsrFy17185261Pp030</td>\n",
+       "      <td>Slurry Seal Parker Avenue</td>\n",
+       "      <td>Local</td>\n",
+       "      <td>Local And Regional</td>\n",
+       "      <td>Lsr1718</td>\n",
+       "      <td>201718 Local Streets And Roads</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>PreConstruction</td>\n",
+       "      <td>Slurry Seal 16324 Sf For Roadway With 69 Pci</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>24</td>\n",
+       "      <td>13</td>\n",
+       "      <td>24</td>\n",
+       "      <td>13</td>\n",
+       "      <td>San Mateo</td>\n",
+       "      <td>Atherton  Submitted By City</td>\n",
+       "      <td>Sm</td>\n",
+       "      <td>Atn</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
        "      <td>None</td>\n",
-       "      <td>19/20, 20/21</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sgr</td>\n",
-       "      <td>In Progress, Planned</td>\n",
-       "      <td>None</td>\n",
-       "      <td>0.12</td>\n",
-       "      <td>0.12</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>Corcoran</td>\n",
-       "      <td>Kings</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sb1</td>\n",
-       "      <td>Iijaprogram: ,  Dateupdated: 2021-09-09,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Corcoran,  Popup: None</td>\n",
-       "      <td>Sb1Funds: 0.121909,  Iijafunds: 0.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Fully funded</td>\n",
-       "      <td>No</td>\n",
-       "      <td>number of strings in project desc: 1 % of null values:40</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1085</th>\n",
-       "      <td>Spring Street Overlay</td>\n",
-       "      <td>City Of Signal Hill</td>\n",
-       "      <td>19/20</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Lsr</td>\n",
-       "      <td>Planned</td>\n",
-       "      <td>None</td>\n",
-       "      <td>3.00</td>\n",
-       "      <td>0.13</td>\n",
-       "      <td>2.87</td>\n",
-       "      <td>Signal Hill</td>\n",
-       "      <td>Los Angeles</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sb1</td>\n",
-       "      <td>Iijaprogram: ,  Dateupdated: 6/30/2021,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Signal Hill,  Popup: None</td>\n",
-       "      <td>Sb1Funds: 0.126705,  Iijafunds: 0.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Partially funded</td>\n",
-       "      <td>No</td>\n",
-       "      <td>number of strings in project desc: 1 % of null values:27</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2106</th>\n",
-       "      <td>Major Damage Restoration</td>\n",
-       "      <td>Caltrans</td>\n",
-       "      <td>20/21</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>In Progress</td>\n",
-       "      <td>A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.</td>\n",
-       "      <td>16.52</td>\n",
-       "      <td>9.08</td>\n",
-       "      <td>7.44</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Del Norte</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sb1</td>\n",
-       "      <td>Iijaprogram: State Hwy Operations &amp; Protection Program Major-Federal,  Dateupdated: 2022-06-28,  Isonshs: None,  Isonshscodes: Y,  Agencies: Caltrans,  Popup: Major Damage Restorationbr</td>\n",
-       "      <td>Sb1Funds: 0.0,  Iijafunds: 9.083566</td>\n",
-       "      <td>01</td>\n",
-       "      <td>A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.</td>\n",
-       "      <td>Partially funded</td>\n",
-       "      <td>Yes</td>\n",
-       "      <td>number of strings in project desc: 25 % of null values:18</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2018</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Atherton    City</td>\n",
+       "      <td>c61c443fb21d</td>\n",
+       "      <td>True</td>\n",
+       "      <td>True</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
        "</table>\n",
        "</div>"
       ],
       "text/plain": [
-       "                 project_title          lead_agency  project_year  \\\n",
-       "358                       None                 None  19/20, 20/21   \n",
-       "1085     Spring Street Overlay  City Of Signal Hill         19/20   \n",
-       "2106  Major Damage Restoration             Caltrans         20/21   \n",
-       "\n",
-       "     project_category grant_program                 phase  \\\n",
-       "358              None           Sgr  In Progress, Planned   \n",
-       "1085             None           Lsr               Planned   \n",
-       "2106             None         Shopp           In Progress   \n",
+       "               projectid                   projname projcatcode  \\\n",
+       "3449  LsrFy17185261Pp030  Slurry Seal Parker Avenue       Local   \n",
        "\n",
-       "                                                                                                                                         project_description  \\\n",
-       "358                                                                                                                                                     None   \n",
-       "1085                                                                                                                                                    None   \n",
-       "2106  A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.   \n",
+       "            projcategory projprogcode                     projprogram  \\\n",
+       "3449  Local And Regional      Lsr1718  201718 Local Streets And Roads   \n",
        "\n",
-       "      total_project_cost_(millions)  total_available_funds_(millions)  \\\n",
-       "358                            0.12                              0.12   \n",
-       "1085                           3.00                              0.13   \n",
-       "2106                          16.52                              9.08   \n",
+       "     multiprogfunded       projstatus  \\\n",
+       "3449             NaN  PreConstruction   \n",
        "\n",
-       "      unfunded_needs_(millions)         city       county location  \\\n",
-       "358                        0.00     Corcoran        Kings     None   \n",
-       "1085                       2.87  Signal Hill  Los Angeles     None   \n",
-       "2106                       7.44         None    Del Norte     None   \n",
+       "                                       description cost assemblydistrict  \\\n",
+       "3449  Slurry Seal 16324 Sf For Roadway With 69 Pci  NaN               24   \n",
        "\n",
-       "     data_source  \\\n",
-       "358          Sb1   \n",
-       "1085         Sb1   \n",
-       "2106         Sb1   \n",
+       "     senatedistrict assemblycode senatecode countyname  \\\n",
+       "3449             13           24         13  San Mateo   \n",
        "\n",
-       "                                                                                                                                                                                          notes  \\\n",
-       "358                                                                           Iijaprogram: ,  Dateupdated: 2021-09-09,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Corcoran,  Popup: None   \n",
-       "1085                                                                        Iijaprogram: ,  Dateupdated: 6/30/2021,  Isonshs: N,  Isonshscodes: N,  Agencies: City Of Signal Hill,  Popup: None   \n",
-       "2106  Iijaprogram: State Hwy Operations & Protection Program Major-Federal,  Dateupdated: 2022-06-28,  Isonshs: None,  Isonshscodes: Y,  Agencies: Caltrans,  Popup: Major Damage Restorationbr   \n",
+       "                         cityname countycode citycode appagencyname  \\\n",
+       "3449  Atherton  Submitted By City         Sm      Atn           NaN   \n",
        "\n",
-       "                            funding_notes ct_district  \\\n",
-       "358   Sb1Funds: 0.121909,  Iijafunds: 0.0        None   \n",
-       "1085  Sb1Funds: 0.126705,  Iijafunds: 0.0        None   \n",
-       "2106  Sb1Funds: 0.0,  Iijafunds: 9.083566          01   \n",
+       "     impagencyname geometry totalcosts routes constyear  costfull  \\\n",
+       "3449           NaN     None        NaN    NaN      2018       NaN   \n",
        "\n",
-       "                                                                                                                                        project_description2  \\\n",
-       "358                                                                                                                                                     None   \n",
-       "1085                                                                                                                                                    None   \n",
-       "2106  A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.   \n",
-       "\n",
-       "          fully_funded enough_info  \\\n",
-       "358       Fully funded          No   \n",
-       "1085  Partially funded          No   \n",
-       "2106  Partially funded         Yes   \n",
-       "\n",
-       "                                                         counts  \n",
-       "358    number of strings in project desc: 1 % of null values:40  \n",
-       "1085   number of strings in project desc: 1 % of null values:27  \n",
-       "2106  number of strings in project desc: 25 % of null values:18  "
+       "            projagency project_number  assembly_same  senate_same  \n",
+       "3449  Atherton    City   c61c443fb21d           True         True  "
       ]
      },
-     "execution_count": 53,
+     "execution_count": 76,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects.drop(columns = ['geometry']).sample(3)"
+    "sb1_df.sample()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
-   "id": "0983ba29-f492-4a1a-ad40-78ebd291f7d2",
+   "execution_count": 41,
+   "id": "e37cf474-f657-4dbb-8969-23e14001ac84",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Shopp                              1631\n",
-       "Imperial D 2008                     726\n",
-       "Hm                                  520\n",
-       "Lsr                                 285\n",
-       "State Rail Plan                     276\n",
-       "Atp                                 216\n",
-       "Sgr                                 156\n",
-       "Stip                                126\n",
-       "San Mateo W 2018                     91\n",
-       "Los Angeles Angeles M 2016           89\n",
-       "San Benito G 2004                    86\n",
-       "Santa Clara B 2016                   85\n",
-       "Tircp                                82\n",
-       "Shopa                                79\n",
-       "San Mateo A2 2006                    78\n",
-       "Alameda B 2000                       62\n",
-       "San Diego A 2004                     59\n",
-       "San Joaquin K 2003                   56\n",
-       "Tcep                                 55\n",
-       "San Bernardino I2 2018               51\n",
-       "Sacramento A2 2004                   51\n",
-       "Tulare R 2006                        49\n",
-       "Sta                                  49\n",
-       "Sonoma M 2004                        44\n",
-       "Alameda Bb 2014                      40\n",
-       "Lpp-F                                40\n",
-       "Santa Barbara A 2008                 37\n",
-       "Los Angeles Angeles R 2008           37\n",
-       "Madera T 2006                        36\n",
-       "Sccp                                 34\n",
-       "San Francisco K 2004                 28\n",
-       "Riverside A2 2006                    27\n",
-       "Lpp-C                                21\n",
-       "Stanislaus L 2016                    20\n",
-       "Contra Costa J 2004                  19\n",
-       "Orange M2 2002                       19\n",
-       "Santa Clara A 2000                   14\n",
-       "Sra                                  11\n",
-       "Monterey X 2016                      11\n",
-       "Santa Cruz D 2016                     9\n",
-       "Marin A 2004                          7\n",
-       "Monterey Salinas Transit Q 2016       6\n",
-       "Sonoma Q 2008                         5\n",
-       "Fresno C 2006                         5\n",
-       "Los Angelest Alameda Bb 2014          1\n",
-       "Santa Clara B 2008                    1\n",
-       "Name: grant_program, dtype: int64"
+       "9186"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 41,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects.grant_program.value_counts()"
+    "len(sb1_df)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
-   "id": "301b9bde-499e-445d-a27c-f50f522e4aa9",
+   "execution_count": 42,
+   "id": "eb4314bc-8d4f-4a1b-8425-034091a4f57f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "Sb1                        3305\n",
-       "Local Options Sales Tax    1849\n",
-       "State Rail Plan             276\n",
-       "Name: data_source, dtype: int64"
+       "9186"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 42,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects.data_source.value_counts()"
+    "sb1_df.project_number.nunique()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b09c99bf-a44e-4c8d-a9fb-fb3a86ffa1eb",
+   "metadata": {},
+   "source": [
+    "#### Check that assemblydistrict and assemblycode are the same values"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 31,
-   "id": "0c066920-6b09-4584-bc82-4f88b41e00d8",
+   "execution_count": 43,
+   "id": "2ec3775a-1d30-4f88-bea1-37f55a29e767",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sb1_df['assembly_same'] = sb1_df.assemblycode == sb1_df.assemblydistrict"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "41b4f512-89a6-4a08-b0bd-f1ed309d9c34",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "0.00    20.06\n",
-       "0.33     2.65\n",
-       "0.25     1.25\n",
-       "7.61     0.85\n",
-       "17.86    0.77\n",
-       "Name: total_project_cost_(millions), dtype: float64"
+       "True     7217\n",
+       "False    1969\n",
+       "Name: assembly_same, dtype: int64"
       ]
      },
-     "execution_count": 31,
+     "execution_count": 44,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects[\"total_project_cost_(millions)\"].value_counts().head() / len(all_projects) * 100"
+    "sb1_df.assembly_same.value_counts()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 52,
-   "id": "413ac763-c08b-48b0-91d5-6e53fd8f2c32",
+   "execution_count": 45,
+   "id": "76325c35-f25b-4635-8601-3f576557abee",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sb1_df['senate_same'] = sb1_df.senatedistrict == sb1_df.senatecode"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "5962a32a-c9d2-4373-8e05-329b62b42256",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "No available funding info    1963\n",
-       "Partially funded             1796\n",
-       "No project cost info         1089\n",
-       "Fully funded                  582\n",
-       "Name: fully_funded, dtype: int64"
+       "True     6950\n",
+       "False    2236\n",
+       "Name: senate_same, dtype: int64"
       ]
      },
-     "execution_count": 52,
+     "execution_count": 46,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects.fully_funded.value_counts()"
+    "sb1_df.senate_same.value_counts()"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "8add5491-77d7-4eaa-ad79-57072f7eddd9",
+   "cell_type": "code",
+   "execution_count": 47,
+   "id": "0a1360f5-eb9a-49a9-947e-249f3d8e9b99",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "### Metrics\n",
-    "* Rewrite to be shorter?\n",
-    "* Correct spelling of descriptions?\n",
-    "* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305"
+    "sb1_county = separate_out_df(sb1_df, ['project_number', 'countyname'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 65,
-   "id": "6d6253cd-b5f8-4431-a575-9a274e6e8bae",
+   "execution_count": 48,
+   "id": "0330602b-7b39-48c9-835f-d7cbbd4fcd3f",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def add_categories(df):\n",
-    "    \"\"\"\n",
-    "    Create general categories for each projects.\n",
-    "    https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305\n",
-    "    \"\"\"\n",
-    "    # There are many projects that are \n",
-    "    ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist', \n",
-    "                             'pedestrian', \n",
-    "                             ## including the spelling errors of `pedestrian`\n",
-    "                             'pedestrain',\n",
-    "                             'crosswalk', \n",
-    "                             'bulb out', 'bulb-out', \n",
-    "                             'active transp', 'traffic reduction', \n",
-    "                             'speed reduction', 'ped', 'srts', \n",
-    "                             'safe routes to school',\n",
-    "                             'sidewalk', 'side walk', 'Cl ', 'trail',\n",
-    "                             'atp'\n",
-    "                            ]\n",
-    "    TRANSIT = ['bus', 'metro', 'station', #Station comes up a few times as a charging station and also as a train station\n",
-    "               'transit','fare', 'brt', 'yarts', 'railroad', 'highway-rail'\n",
-    "               # , 'station' in description and 'charging station' not in description\n",
-    "              ] \n",
-    "    BRIDGE = [\"bridge\", 'viaduct']\n",
-    "    STREET = ['traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal' \n",
-    "              'sign', 'stripe', 'striping', 'median', \n",
-    "              'guard rail', 'guardrail', \n",
-    "              'road', 'street', \n",
-    "              'sinkhole', 'intersection', 'signal', 'curb',\n",
-    "              'light', 'tree', 'pavement', 'roundabout'\n",
-    "             ]\n",
-    "\n",
-    "    FREEWAY = ['hov ', 'hot ', 'freeway', 'highway', 'express lanes', 'hwy']\n",
-    "\n",
-    "    INFRA_RESILIENCY_ER = ['repair', 'emergency', 'replace','retrofit', 'er',\n",
-    "                           'rehab', 'improvements', 'seismic', 'reconstruct', 'restoration']\n",
-    "\n",
-    "    CONGESTION_RELIEF = ['congestion', 'rideshare','ridesharing', 'vanpool', 'car share']\n",
-    "\n",
-    "    NOT_INC = ['charging', 'fueling', 'cng', 'bridge', 'trail',\n",
-    "           'k-rail', 'guardrails', 'bridge rail', 'guard', 'guarrail']\n",
-    "    \n",
-    "    PASSENGER_MODE = ['non sov', 'high quality transit areas', \n",
-    "                      'hqta', 'hov']\n",
-    "    \n",
-    "    \n",
-    "    SAFETY = ['fatalities','safe', 'speed management','signal coordination',\n",
-    "              'slow speeds', 'roundabouts', 'victims','collisoins','protect',\n",
-    "              'crash', 'modification factors', 'safety system'] \n",
-    "    \n",
-    "    def categorize_project_descriptions(row):\n",
-    "        \"\"\"\n",
-    "        This function takes a individual type of work description (row of a dataframe)\n",
-    "        and returns a dummy flag of 1 if it finds keyword present in\n",
-    "        project categories (active transportation, transit, bridge, etc).\n",
-    "        A description can contain multiple keywords across categories.\n",
-    "        \"\"\"\n",
-    "        # Clean up project description 2\n",
-    "        project_description = (row.project_description2.lower()\n",
-    "                               .replace(\"-\",\"\")\n",
-    "                               .replace(\".\",\"\")\n",
-    "                               .replace(\":\",\"\")\n",
-    "                              )\n",
-    "    \n",
-    "        # Store a bunch of columns that will be flagged\n",
-    "        # A project can involve multiple things...also, not sure what's in the descriptions\n",
-    "        active_transp = \"\"\n",
-    "        transit = \"\"\n",
-    "        bridge =\"\"\n",
-    "        street = \"\"\n",
-    "        freeway = \"\"\n",
-    "        infra_resiliency_er = \"\"\n",
-    "        congestion_relief = \"\"\n",
-    "        passenger_mode_shift = \"\"\n",
-    "        safety = \"\"\n",
-    "        \n",
-    "        if any(word in project_description for word in ACTIVE_TRANSPORTATION):\n",
-    "            active_transp = \"active transportation\"\n",
-    "        \n",
-    "        #if any(word in description if instanceof(word, str) else word(description) for word in TRANSIT)\n",
-    "\n",
-    "        if (any(word in project_description for word in TRANSIT) and \n",
-    "            not any(exclude_word in project_description for exclude_word in NOT_INC)\n",
-    "           ):\n",
-    "            transit = \"transit\"\n",
-    "        if any(word in project_description for word in BRIDGE):\n",
-    "            bridge = \"bridge\"\n",
-    "        if any(word in project_description for word in STREET):\n",
-    "            street = \"street\"\n",
-    "        if any(word in project_description for word in FREEWAY):\n",
-    "            freeway = \"freeway\" \n",
-    "        if any(word in project_description for word in INFRA_RESILIENCY_ER):\n",
-    "            infra_resiliency_er = \"infrastructure\"\n",
-    "        if any(word in project_description for word in CONGESTION_RELIEF):\n",
-    "            congestion_relief = \"congestion relief\"    \n",
-    "        if any(word in project_description for word in PASSENGER_MODE):\n",
-    "            passenger_mode_shift = \"passenger mode shift\"    \n",
-    "        if any(word in project_description for word in SAFETY):\n",
-    "            safety = \"safety\"    \n",
-    "        return pd.Series(\n",
-    "            [active_transp, transit, bridge, street, freeway, infra_resiliency_er, congestion_relief,\n",
-    "            passenger_mode_shift, safety], \n",
-    "            index=['active_transp', 'transit', 'bridge', 'street', \n",
-    "                   'freeway', 'infra_resiliency_er', 'congestion_relief',\n",
-    "                  'passenger_mode_shift', 'safety']\n",
-    "        )\n",
-    "    \n",
-    "    \n",
-    "    work_categories = df.apply(categorize_project_descriptions, axis=1)\n",
-    "    work_cols = list(work_categories.columns)\n",
-    "    df2 = pd.concat([df, work_categories], axis=1)\n",
-    "    \n",
-    "    df2['categories'] = df2[work_cols].agg(' '.join, axis=1)\n",
-    "    df2['categories'] = df2['categories'].str.strip()\n",
-    "    df2 = df2.drop(columns = work_cols)\n",
-    "    \n",
+    "def explode_dataframe(df:pd.DataFrame, column_to_explode:str)-> pd.DataFrame:\n",
+    "    df['Column2'] = df[column_to_explode].apply(lambda x: [int(i) if i.isdigit() else i for i in x.replace(',', '').split()])\n",
+    "    df = df.drop(columns = [column_to_explode])\n",
+    "    df2 = df.explode('Column2')\n",
+    "    df2 = df2.rename(columns = {'Column2': column_to_explode})\n",
     "    return df2"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 66,
-   "id": "6ea11daa-3a18-4d8a-9004-b2fc5e6d4343",
+   "execution_count": 49,
+   "id": "ea4fd105-9188-426f-a724-5de40ae5af4b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects_metric = add_categories(all_projects)"
+    "sb1_assembly = separate_out_df(sb1_df, ['project_number', 'assemblydistrict'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 67,
-   "id": "f3856c74-228d-4cf8-929a-cac486024586",
+   "execution_count": 50,
+   "id": "6b621482-fb52-441f-9fde-bb021dda35c5",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>project_title</th>\n",
-       "      <th>lead_agency</th>\n",
-       "      <th>project_year</th>\n",
-       "      <th>project_category</th>\n",
-       "      <th>grant_program</th>\n",
-       "      <th>phase</th>\n",
-       "      <th>project_description</th>\n",
-       "      <th>total_project_cost_(millions)</th>\n",
-       "      <th>total_available_funds_(millions)</th>\n",
-       "      <th>unfunded_needs_(millions)</th>\n",
-       "      <th>city</th>\n",
-       "      <th>county</th>\n",
-       "      <th>location</th>\n",
-       "      <th>data_source</th>\n",
-       "      <th>notes</th>\n",
-       "      <th>funding_notes</th>\n",
-       "      <th>ct_district</th>\n",
-       "      <th>project_description2</th>\n",
-       "      <th>fully_funded</th>\n",
-       "      <th>enough_info</th>\n",
-       "      <th>counts</th>\n",
-       "      <th>categories</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>1886</th>\n",
-       "      <td>Safety - Hm4</td>\n",
-       "      <td>Caltrans</td>\n",
-       "      <td>21/22</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Hm</td>\n",
-       "      <td>In Progress</td>\n",
-       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
-       "      <td>0.20</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>0.20</td>\n",
-       "      <td>Visalia</td>\n",
-       "      <td>Tulare</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sb1</td>\n",
-       "      <td>Iijaprogram: None,  Dateupdated: 2022-09-19,  Isonshs: None,  Isonshscodes: N,  Agencies: Caltrans,  Popup:</td>\n",
-       "      <td>Sb1Funds: 0.0,  Iijafunds: 0.0</td>\n",
-       "      <td>06</td>\n",
-       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
-       "      <td>No available funding info</td>\n",
-       "      <td>Yes</td>\n",
-       "      <td>number of strings in project desc: 5 % of null values:22</td>\n",
-       "      <td>infrastructure</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1119</th>\n",
-       "      <td>Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Freeway Safety And Congestion Relief Program</td>\n",
-       "      <td>Sacramento A2 2004</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>47.00</td>\n",
-       "      <td>0.00</td>\n",
-       "      <td>47.00</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Sacramento</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Local Options Sales Tax</td>\n",
-       "      <td>Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.</td>\n",
-       "      <td>Estimated Lost Funds: 0.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 0.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S</td>\n",
-       "      <td>No available funding info</td>\n",
-       "      <td>No</td>\n",
-       "      <td>number of strings in project desc: 1 % of null values:40</td>\n",
-       "      <td>transit</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1589</th>\n",
-       "      <td>Highway 101: Betteravia Road Interchange</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Santa Barbara A 2008</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.</td>\n",
-       "      <td>2.00</td>\n",
-       "      <td>5.00</td>\n",
-       "      <td>-3.00</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Santa Barbara</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Local Options Sales Tax</td>\n",
-       "      <td>Notes: Nan</td>\n",
-       "      <td>Estimated Lost Funds: 2.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 3.0</td>\n",
-       "      <td>None</td>\n",
-       "      <td>Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.</td>\n",
-       "      <td>Fully funded</td>\n",
-       "      <td>No</td>\n",
-       "      <td>number of strings in project desc: 24 % of null values:36</td>\n",
-       "      <td>street freeway infrastructure</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                            project_title lead_agency  \\\n",
-       "1886                                         Safety - Hm4    Caltrans   \n",
-       "1119  Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S        None   \n",
-       "1589             Highway 101: Betteravia Road Interchange        None   \n",
-       "\n",
-       "     project_year                              project_category  \\\n",
-       "1886        21/22                                          None   \n",
-       "1119         None  Freeway Safety And Congestion Relief Program   \n",
-       "1589         None                                          None   \n",
-       "\n",
-       "             grant_program        phase  \\\n",
-       "1886                    Hm  In Progress   \n",
-       "1119    Sacramento A2 2004         None   \n",
-       "1589  Santa Barbara A 2008         None   \n",
-       "\n",
-       "                                                                                                                                                          project_description  \\\n",
-       "1886                                                                                                                        Maintain/Repair Transportaiton Management Systems   \n",
-       "1119                                                                                                                                                                     None   \n",
-       "1589  Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.   \n",
-       "\n",
-       "      total_project_cost_(millions)  total_available_funds_(millions)  \\\n",
-       "1886                           0.20                              0.00   \n",
-       "1119                          47.00                              0.00   \n",
-       "1589                           2.00                              5.00   \n",
-       "\n",
-       "      unfunded_needs_(millions)     city         county location  \\\n",
-       "1886                       0.20  Visalia         Tulare     None   \n",
-       "1119                      47.00     None     Sacramento     None   \n",
-       "1589                      -3.00     None  Santa Barbara     None   \n",
-       "\n",
-       "                  data_source  \\\n",
-       "1886                      Sb1   \n",
-       "1119  Local Options Sales Tax   \n",
-       "1589  Local Options Sales Tax   \n",
-       "\n",
-       "                                                                                                                                      notes  \\\n",
-       "1886                            Iijaprogram: None,  Dateupdated: 2022-09-19,  Isonshs: None,  Isonshscodes: N,  Agencies: Caltrans,  Popup:   \n",
-       "1119  Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.   \n",
-       "1589                                                                                                                             Notes: Nan   \n",
-       "\n",
-       "                                                                                                                                      funding_notes  \\\n",
-       "1886                                                                                                                 Sb1Funds: 0.0,  Iijafunds: 0.0   \n",
-       "1119  Estimated Lost Funds: 0.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 0.0   \n",
-       "1589  Estimated Lost Funds: 2.0,  Estimated Federal Funds: 0.0,  Estimated State Funds: 0.0,  Estimated Local Funds: 0,  Estimated Other Funds: 3.0   \n",
-       "\n",
-       "     ct_district  \\\n",
-       "1886          06   \n",
-       "1119        None   \n",
-       "1589        None   \n",
-       "\n",
-       "                                                                                                                                                         project_description2  \\\n",
-       "1886                                                                                                                        Maintain/Repair Transportaiton Management Systems   \n",
-       "1119                                                                                                                      Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S   \n",
-       "1589  Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.   \n",
-       "\n",
-       "                   fully_funded enough_info  \\\n",
-       "1886  No available funding info         Yes   \n",
-       "1119  No available funding info          No   \n",
-       "1589               Fully funded          No   \n",
-       "\n",
-       "                                                         counts  \\\n",
-       "1886   number of strings in project desc: 5 % of null values:22   \n",
-       "1119   number of strings in project desc: 1 % of null values:40   \n",
-       "1589  number of strings in project desc: 24 % of null values:36   \n",
-       "\n",
-       "                         categories  \n",
-       "1886                 infrastructure  \n",
-       "1119                        transit  \n",
-       "1589  street freeway infrastructure  "
-      ]
-     },
-     "execution_count": 67,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "all_projects_metric.drop(columns = ['geometry']).sample(3)"
+    "sb1_assembly = explode_dataframe(sb1_assembly, 'assemblydistrict')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 69,
-   "id": "ad99b589-1d78-4052-96ac-4617f0494544",
+   "execution_count": 51,
+   "id": "a0e95618-ea4f-4b7d-b1fc-954a780dd86f",
    "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "infrastructure                                                    1436\n",
-       "                                                                  1381\n",
-       "street  infrastructure                                             739\n",
-       "street                                                             372\n",
-       "bridge   infrastructure                                            226\n",
-       "transit    infrastructure                                          201\n",
-       "active transportation   street  infrastructure                     106\n",
-       "transit                                                             75\n",
-       "street  infrastructure   safety                                     58\n",
-       "transit  street  infrastructure                                     52\n",
-       "freeway infrastructure                                              52\n",
-       "bridge street  infrastructure                                       45\n",
-       "bridge                                                              44\n",
-       "active transportation     infrastructure                            44\n",
-       "active transportation                                               42\n",
-       "street freeway infrastructure   safety                              41\n",
-       "street freeway infrastructure                                       37\n",
-       "infrastructure   safety                                             36\n",
-       "active transportation   street  infrastructure   safety             29\n",
-       "freeway infrastructure  passenger mode shift                        22\n",
-       "active transportation transit    infrastructure                     21\n",
-       "freeway                                                             20\n",
-       "freeway infrastructure   safety                                     18\n",
-       "active transportation transit  street  infrastructure               17\n",
-       "bridge street  infrastructure   safety                              14\n",
-       "bridge   infrastructure   safety                                    12\n",
-       "street  infrastructure congestion relief                            11\n",
-       "active transportation transit  street  infrastructure   safety      11\n",
-       "passenger mode shift                                                11\n",
-       "street     safety                                                   10\n",
-       "Name: categories, dtype: int64"
-      ]
-     },
-     "execution_count": 69,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
+   "outputs": [],
    "source": [
-    "all_projects_metric.categories.value_counts().head(30)"
+    "sb1_senate = separate_out_df(sb1_df, ['project_number', 'senatedistrict'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 88,
-   "id": "270e8b35-cc6b-4461-835c-40c4b850916d",
+   "execution_count": 52,
+   "id": "1d67d259-8822-4512-877b-bc1b66de7d96",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def apply_metrics(df):\n",
-    "    def categorize_metrics(row):\n",
-    "        categories = row.categories.lower()\n",
-    "        safety = \"\"\n",
-    "        passenger_mode_shift = \"\"\n",
-    "        infill_development = \"\"\n",
-    "        \n",
-    "        if any(word in categories for word in ['infrastructure','safety',]):\n",
-    "            safety = \"safety\"\n",
-    "        if any(word in categories for word in ['active transportation', 'passenger_mode_shift', \"congestion relief\"]):\n",
-    "            passenger_mode_shift = \"passenger_mode_shift\"\n",
-    "        if any(word in categories for word in ['transit', 'active transportation',]):\n",
-    "            infill_development = \"infill_development\" \n",
-    "       \n",
-    "        return pd.Series(\n",
-    "            [safety,passenger_mode_shift,infill_development], \n",
-    "            index=['safety', 'passenger_mode_shift', 'infill_development']\n",
-    "        )\n",
-    "    \n",
-    "    work_categories = df.apply(categorize_metrics, axis=1)\n",
-    "    work_cols = list(work_categories.columns)\n",
-    "    df2 = pd.concat([df, work_categories], axis=1)\n",
-    "    \n",
-    "    df2['applicable_metrics'] = df2[work_cols].agg(' '.join, axis=1)\n",
-    "    df2['applicable_metrics'] = df2['applicable_metrics'].str.strip()\n",
-    "    df2 = df2.drop(columns = work_cols)\n",
-    "    \n",
-    "    return df2"
+    "sb1_senate = explode_dataframe(sb1_senate, 'senatedistrict')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 89,
-   "id": "9a643de4-b6b3-4751-9a9f-b68abe4d7a22",
+   "execution_count": 53,
+   "id": "137fda15-a412-49fe-be74-48d58f750bb3",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects_metric = apply_metrics(all_projects_metric)"
+    "sb1_city = separate_out_df(sb1_df, ['project_number', 'cityname'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 90,
-   "id": "a6da3b49-dd3f-4b01-b394-23f44bf8e3a6",
-   "metadata": {
-    "scrolled": true,
-    "tags": []
-   },
+   "execution_count": 56,
+   "id": "d98c2dc2-3232-4fe0-ba3f-6f2d148fb755",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sb1_geo = separate_out_df(sb1_df, ['project_number', 'geometry'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "id": "67e7d80d-ee5c-4c1b-8337-bb30be56f585",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sb1_awards = separate_out_df(sb1_df, ['project_number', 'projprogram'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "id": "ab21d511-2062-4c89-ad45-dfdc95721cc1",
+   "metadata": {},
    "outputs": [
     {
      "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>grant_program</th>\n",
-       "      <th>project_description2</th>\n",
-       "      <th>categories</th>\n",
-       "      <th>applicable_metrics</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>2587</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1058</th>\n",
-       "      <td>Lsr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>State Rail Plan</td>\n",
-       "      <td>Expansion Of The Smart Fleet To Accommodate Service Expansion.</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>845</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Overlay</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1933</th>\n",
-       "      <td>Hm</td>\n",
-       "      <td>Maintain/Repair Pavement - Seal Coat</td>\n",
-       "      <td>street  infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2032</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>331</th>\n",
-       "      <td>Sgr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3222</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)</td>\n",
-       "      <td>bridge   infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>106</th>\n",
-       "      <td>State Rail Plan</td>\n",
-       "      <td>Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.</td>\n",
-       "      <td>transit</td>\n",
-       "      <td>infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1092</th>\n",
-       "      <td>Lsr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1703</th>\n",
-       "      <td>Stip</td>\n",
-       "      <td>Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198.  Widen To 4 Divided Lanes And Realign Highway.</td>\n",
-       "      <td>street freeway infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>753</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Overlay</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>43</th>\n",
-       "      <td>State Rail Plan</td>\n",
-       "      <td>Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.</td>\n",
-       "      <td>transit    infrastructure</td>\n",
-       "      <td>safety  infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>627</th>\n",
-       "      <td>Hm</td>\n",
-       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>900</th>\n",
-       "      <td>Los Angeles Angeles M 2016</td>\n",
-       "      <td>Transportation System And Mobility Improve Program</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2016</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.</td>\n",
-       "      <td>street  infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1759</th>\n",
-       "      <td>Hm</td>\n",
-       "      <td>Repair/Replace Culverts</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2082</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.</td>\n",
-       "      <td>street  infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1488</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.</td>\n",
-       "      <td>transit    infrastructure</td>\n",
-       "      <td>safety  infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1886</th>\n",
-       "      <td>Hm</td>\n",
-       "      <td>Maintain/Repair Transportaiton Management Systems</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>311</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Reconstruction</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1335</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.</td>\n",
-       "      <td>bridge street  infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>487</th>\n",
-       "      <td>Hm</td>\n",
-       "      <td>Maintain/Repair Maintenance Station</td>\n",
-       "      <td>transit    infrastructure</td>\n",
-       "      <td>safety  infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>471</th>\n",
-       "      <td>Sgr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>901</th>\n",
-       "      <td>Los Angeles Angeles M 2016</td>\n",
-       "      <td>Active Transportation 1St/Last Mile Connections Prog</td>\n",
-       "      <td>active transportation</td>\n",
-       "      <td>passenger_mode_shift infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>456</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Lincoln Ave From Rose Ave To Weakley St S</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1140</th>\n",
-       "      <td>Lsr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3275</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.</td>\n",
-       "      <td>street  infrastructure   safety</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3176</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.</td>\n",
-       "      <td>active transportation   street</td>\n",
-       "      <td>passenger_mode_shift infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1540</th>\n",
-       "      <td>San Mateo W 2018</td>\n",
-       "      <td>Pedestrian Accessibility Improvements Citywide</td>\n",
-       "      <td>active transportation     infrastructure</td>\n",
-       "      <td>safety passenger_mode_shift infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1318</th>\n",
-       "      <td>San Diego A 2004</td>\n",
-       "      <td>8F+2Hov</td>\n",
-       "      <td>passenger mode shift</td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3135</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>320</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Overlay</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1633</th>\n",
-       "      <td>Tcep</td>\n",
-       "      <td>In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System  Needed To Operate The I-15 Express Lanes Project (08-0167M).</td>\n",
-       "      <td>street freeway infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1376</th>\n",
-       "      <td>San Joaquin K 2003</td>\n",
-       "      <td>Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3036</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.</td>\n",
-       "      <td>street  infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>826</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Reconstruct</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>340</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Crack Seal/Slurry Coat</td>\n",
-       "      <td>street</td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2405</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.</td>\n",
-       "      <td>street freeway infrastructure   safety</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>814</th>\n",
-       "      <td>Imperial D 2008</td>\n",
-       "      <td>Overlay</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>44</th>\n",
-       "      <td>Tircp</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1041</th>\n",
-       "      <td>Lsr</td>\n",
-       "      <td>None</td>\n",
-       "      <td></td>\n",
-       "      <td></td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>134</th>\n",
-       "      <td>State Rail Plan</td>\n",
-       "      <td>Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton.  This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.</td>\n",
-       "      <td>transit    infrastructure</td>\n",
-       "      <td>safety  infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2009</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.</td>\n",
-       "      <td>bridge   infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>12</th>\n",
-       "      <td>Alameda B 2000</td>\n",
-       "      <td>I-580 Interchange Improvements In Castro Valley</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2107</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.</td>\n",
-       "      <td>infrastructure   safety</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1371</th>\n",
-       "      <td>Shopp</td>\n",
-       "      <td>A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.</td>\n",
-       "      <td>bridge   infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>973</th>\n",
-       "      <td>Los Angeles Angeles R 2008</td>\n",
-       "      <td>Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)</td>\n",
-       "      <td>transit    infrastructure</td>\n",
-       "      <td>safety  infill_development</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1823</th>\n",
-       "      <td>Tulare R 2006</td>\n",
-       "      <td>Over Crossing</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>97</th>\n",
-       "      <td>State Rail Plan</td>\n",
-       "      <td>Double Track Between Cp Canyon (Newhall Siding)  And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.</td>\n",
-       "      <td>infrastructure</td>\n",
-       "      <td>safety</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
       "text/plain": [
-       "                   grant_program  \\\n",
-       "2587                       Shopp   \n",
-       "1058                         Lsr   \n",
-       "17               State Rail Plan   \n",
-       "845              Imperial D 2008   \n",
-       "1933                          Hm   \n",
-       "2032                       Shopp   \n",
-       "331                          Sgr   \n",
-       "3222                       Shopp   \n",
-       "106              State Rail Plan   \n",
-       "1092                         Lsr   \n",
-       "1703                        Stip   \n",
-       "753              Imperial D 2008   \n",
-       "43               State Rail Plan   \n",
-       "627                           Hm   \n",
-       "900   Los Angeles Angeles M 2016   \n",
-       "2016                       Shopp   \n",
-       "1759                          Hm   \n",
-       "2082                       Shopp   \n",
-       "1488                       Shopp   \n",
-       "1886                          Hm   \n",
-       "311              Imperial D 2008   \n",
-       "1335                       Shopp   \n",
-       "487                           Hm   \n",
-       "471                          Sgr   \n",
-       "901   Los Angeles Angeles M 2016   \n",
-       "456              Imperial D 2008   \n",
-       "1140                         Lsr   \n",
-       "3275                       Shopp   \n",
-       "3176                       Shopp   \n",
-       "1540            San Mateo W 2018   \n",
-       "1318            San Diego A 2004   \n",
-       "3135                       Shopp   \n",
-       "320              Imperial D 2008   \n",
-       "1633                        Tcep   \n",
-       "1376          San Joaquin K 2003   \n",
-       "3036                       Shopp   \n",
-       "826              Imperial D 2008   \n",
-       "340              Imperial D 2008   \n",
-       "2405                       Shopp   \n",
-       "814              Imperial D 2008   \n",
-       "44                         Tircp   \n",
-       "1041                         Lsr   \n",
-       "134              State Rail Plan   \n",
-       "2009                       Shopp   \n",
-       "12                Alameda B 2000   \n",
-       "2107                       Shopp   \n",
-       "1371                       Shopp   \n",
-       "973   Los Angeles Angeles R 2008   \n",
-       "1823               Tulare R 2006   \n",
-       "97               State Rail Plan   \n",
-       "\n",
-       "                                                                                                                                                                                                                                                                                                               project_description2  \\\n",
-       "2587                                                                                                                                                                   A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.   \n",
-       "1058                                                                                                                                                                                                                                                                                                                           None   \n",
-       "17                                                                                                                                                                                                                                                                   Expansion Of The Smart Fleet To Accommodate Service Expansion.   \n",
-       "845                                                                                                                                                                                                                                                                                                                         Overlay   \n",
-       "1933                                                                                                                                                                                                                                                                                           Maintain/Repair Pavement - Seal Coat   \n",
-       "2032                                                                                                                                                                                                                                          A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.   \n",
-       "331                                                                                                                                                                                                                                                                                                                            None   \n",
-       "3222                                                                                                                        A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)   \n",
-       "106                                                                                                                                                                    Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.   \n",
-       "1092                                                                                                                                                                                                                                                                                                                           None   \n",
-       "1703                                                                                                                                                                       Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198.  Widen To 4 Divided Lanes And Realign Highway.   \n",
-       "753                                                                                                                                                                                                                                                                                                                         Overlay   \n",
-       "43    Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.   \n",
-       "627                                                                                                                                                                                                                                                                               Maintain/Repair Transportaiton Management Systems   \n",
-       "900                                                                                                                                                                                                                                                                              Transportation System And Mobility Improve Program   \n",
-       "2016                                                                                                                                                          A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.   \n",
-       "1759                                                                                                                                                                                                                                                                                                        Repair/Replace Culverts   \n",
-       "2082                             A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.   \n",
-       "1488                                                                                                                                                                                                                     A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.   \n",
-       "1886                                                                                                                                                                                                                                                                              Maintain/Repair Transportaiton Management Systems   \n",
-       "311                                                                                                                                                                                                                                                                                                                  Reconstruction   \n",
-       "1335                                                                                                                                                A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.   \n",
-       "487                                                                                                                                                                                                                                                                                             Maintain/Repair Maintenance Station   \n",
-       "471                                                                                                                                                                                                                                                                                                                            None   \n",
-       "901                                                                                                                                                                                                                                                                            Active Transportation 1St/Last Mile Connections Prog   \n",
-       "456                                                                                                                                                                                                                                                                                       Lincoln Ave From Rose Ave To Weakley St S   \n",
-       "1140                                                                                                                                                                                                                                                                                                                           None   \n",
-       "3275                                                                                                                                                                             A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.   \n",
-       "3176                                                                                                                                                                       A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.   \n",
-       "1540                                                                                                                                                                                                                                                                                 Pedestrian Accessibility Improvements Citywide   \n",
-       "1318                                                                                                                                                                                                                                                                                                                        8F+2Hov   \n",
-       "3135                                                                                                                                                                                                            A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).   \n",
-       "320                                                                                                                                                                                                                                                                                                                         Overlay   \n",
-       "1633                                       In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System  Needed To Operate The I-15 Express Lanes Project (08-0167M).   \n",
-       "1376                                                                                                                                                                                                                                                                   Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits   \n",
-       "3036                                                                                                             A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.   \n",
-       "826                                                                                                                                                                                                                                                                                                                     Reconstruct   \n",
-       "340                                                                                                                                                                                                                                                                                                          Crack Seal/Slurry Coat   \n",
-       "2405                                                                                                               A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.   \n",
-       "814                                                                                                                                                                                                                                                                                                                         Overlay   \n",
-       "44                                                                                                                                                                                                                                                                                                                             None   \n",
-       "1041                                                                                                                                                                                                                                                                                                                           None   \n",
-       "134                                                               Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton.  This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.   \n",
-       "2009                                                                                                                                                                                                                                         A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.   \n",
-       "12                                                                                                                                                                                                                                                                                  I-580 Interchange Improvements In Castro Valley   \n",
-       "2107                                                                                                                                                                                                                                      A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.   \n",
-       "1371                                                                                                                                                                                                                                        A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.   \n",
-       "973   Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)   \n",
-       "1823                                                                                                                                                                                                                                                                                                                  Over Crossing   \n",
-       "97                                                                                                                                                                                                  Double Track Between Cp Canyon (Newhall Siding)  And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.   \n",
-       "\n",
-       "                                    categories  \\\n",
-       "2587                            infrastructure   \n",
-       "1058                                             \n",
-       "17                              infrastructure   \n",
-       "845                             infrastructure   \n",
-       "1933                    street  infrastructure   \n",
-       "2032                            infrastructure   \n",
-       "331                                              \n",
-       "3222                   bridge   infrastructure   \n",
-       "106                                    transit   \n",
-       "1092                                             \n",
-       "1703             street freeway infrastructure   \n",
-       "753                             infrastructure   \n",
-       "43                   transit    infrastructure   \n",
-       "627                             infrastructure   \n",
-       "900                                              \n",
-       "2016                    street  infrastructure   \n",
-       "1759                            infrastructure   \n",
-       "2082                    street  infrastructure   \n",
-       "1488                 transit    infrastructure   \n",
-       "1886                            infrastructure   \n",
-       "311                             infrastructure   \n",
-       "1335             bridge street  infrastructure   \n",
-       "487                  transit    infrastructure   \n",
-       "471                                              \n",
-       "901                      active transportation   \n",
-       "456                                              \n",
-       "1140                                             \n",
-       "3275           street  infrastructure   safety   \n",
-       "3176            active transportation   street   \n",
-       "1540  active transportation     infrastructure   \n",
-       "1318                      passenger mode shift   \n",
-       "3135                                             \n",
-       "320                             infrastructure   \n",
-       "1633             street freeway infrastructure   \n",
-       "1376                                             \n",
-       "3036                    street  infrastructure   \n",
-       "826                             infrastructure   \n",
-       "340                                     street   \n",
-       "2405    street freeway infrastructure   safety   \n",
-       "814                             infrastructure   \n",
-       "44                                               \n",
-       "1041                                             \n",
-       "134                  transit    infrastructure   \n",
-       "2009                   bridge   infrastructure   \n",
-       "12                              infrastructure   \n",
-       "2107                   infrastructure   safety   \n",
-       "1371                   bridge   infrastructure   \n",
-       "973                  transit    infrastructure   \n",
-       "1823                            infrastructure   \n",
-       "97                              infrastructure   \n",
-       "\n",
-       "                                  applicable_metrics  \n",
-       "2587                                          safety  \n",
-       "1058                                                  \n",
-       "17                                            safety  \n",
-       "845                                           safety  \n",
-       "1933                                          safety  \n",
-       "2032                                          safety  \n",
-       "331                                                   \n",
-       "3222                                          safety  \n",
-       "106                               infill_development  \n",
-       "1092                                                  \n",
-       "1703                                          safety  \n",
-       "753                                           safety  \n",
-       "43                        safety  infill_development  \n",
-       "627                                           safety  \n",
-       "900                                                   \n",
-       "2016                                          safety  \n",
-       "1759                                          safety  \n",
-       "2082                                          safety  \n",
-       "1488                      safety  infill_development  \n",
-       "1886                                          safety  \n",
-       "311                                           safety  \n",
-       "1335                                          safety  \n",
-       "487                       safety  infill_development  \n",
-       "471                                                   \n",
-       "901          passenger_mode_shift infill_development  \n",
-       "456                                                   \n",
-       "1140                                                  \n",
-       "3275                                          safety  \n",
-       "3176         passenger_mode_shift infill_development  \n",
-       "1540  safety passenger_mode_shift infill_development  \n",
-       "1318                                                  \n",
-       "3135                                                  \n",
-       "320                                           safety  \n",
-       "1633                                          safety  \n",
-       "1376                                                  \n",
-       "3036                                          safety  \n",
-       "826                                           safety  \n",
-       "340                                                   \n",
-       "2405                                          safety  \n",
-       "814                                           safety  \n",
-       "44                                                    \n",
-       "1041                                                  \n",
-       "134                       safety  infill_development  \n",
-       "2009                                          safety  \n",
-       "12                                            safety  \n",
-       "2107                                          safety  \n",
-       "1371                                          safety  \n",
-       "973                       safety  infill_development  \n",
-       "1823                                          safety  \n",
-       "97                                            safety  "
+       "((9186, 2), (1585, 2), (6696, 2))"
       ]
      },
-     "execution_count": 90,
+     "execution_count": 58,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects_metric[['grant_program','project_description2','categories','applicable_metrics']].sample(50)"
+    "sb1_awards.shape, sb1_geo.shape, sb1_city.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "id": "d09e706e-cf16-4434-8b4b-b2661f0ba742",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sb1_agencies = separate_out_df(sb1_df, ['project_number', 'projagency', 'appagencyname', 'impagencyname'])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 86,
-   "id": "9ea21916-dd50-4396-850b-87ea2535c9f4",
+   "execution_count": 60,
+   "id": "9a07f377-dc3d-402c-8c26-2f7def5abb41",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "7"
+       "(9186, 4)"
       ]
      },
-     "execution_count": 86,
+     "execution_count": 60,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects_metric.applicable_metrics.nunique()"
+    "sb1_agencies.shape"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "1ad5b6ae-9407-46ae-b2ff-c9ad6cbea83c",
+   "id": "1150a152-1432-40b9-a946-6a79964bf720",
    "metadata": {},
    "source": [
-    "### Categorization"
+    "### LP2000"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 43,
-   "id": "f9a86d28-9b77-48c3-ba6d-27dc360f2fd0",
+   "execution_count": 79,
+   "id": "b1c94570-cb6c-4e47-91c9-59caf8512dc7",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def get_list_of_words(df, col: str) -> list:\n",
-    "    \"\"\"\n",
-    "    Natalie's function to clean and place words in a project description column\n",
-    "    into a list\n",
-    "    \"\"\"\n",
-    "    # get just the one col\n",
-    "    column = df[[col]]\n",
-    "\n",
-    "    # remove single-dimensional entries from the shape of an array\n",
-    "    col_text = column.squeeze()\n",
-    "    # get list of words\n",
-    "    text_list = col_text.tolist()\n",
-    "\n",
-    "    # Join all the column into one large text blob, lower text\n",
-    "    text_list = \" \".join(text_list).lower()\n",
-    "\n",
-    "    # remove punctuation\n",
-    "    text_list = re.sub(r\"[^\\w\\s]\", \"\", text_list)\n",
+    "def load_lp2000(file: str):\n",
+    "    \n",
+    "    df_project = to_snakecase(\n",
+    "        pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"project\")\n",
+    "    )\n",
     "\n",
-    "    # List of stopwords\n",
-    "    swords = [re.sub(r\"[^A-z\\s]\", \"\", sword) for sword in stopwords.words(\"english\")]\n",
+    "    df_county = to_snakecase(\n",
+    "        pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"county\")\n",
+    "    ).drop(columns=[\"project_label_name\"])\n",
     "\n",
-    "    # Remove stopwords\n",
-    "    clean_text_list = [\n",
-    "        word for word in word_tokenize(text_list.lower()) if word not in swords\n",
-    "    ]\n",
+    "    df_district = to_snakecase(\n",
+    "        pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"district\")\n",
+    "    ).drop(columns=[\"project_label_name\"])\n",
     "\n",
-    "    return clean_text_list"
+    "    df_award = to_snakecase(\n",
+    "        pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"awards\")\n",
+    "    )\n",
+    "    \n",
+    "    df_phase = to_snakecase(\n",
+    "        pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"phase_funding\")\n",
+    "    )\n",
+    "    \n",
+    "    return df_project, df_county, df_district, df_award, df_phase"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 44,
-   "id": "cd602787-2444-49c5-8bb8-c59a63975de5",
+   "execution_count": 80,
+   "id": "91c989ec-208b-4053-8ae8-3e3a0e44c5c8",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def find_common_phrases(df, description_column: str, values_to_add: list):\n",
-    "\n",
-    "    # Break apart every word in the description column into a list\n",
-    "    descriptions_list = get_list_of_words(df, description_column)\n",
-    "\n",
-    "    # Get phrases of whatever length you want (2,3,4,etc)\n",
-    "    c = Counter([\" \".join(y) for x in [2] for y in ngrams(descriptions_list, x)])\n",
-    "\n",
-    "    # Make a dataframe out of the counter values\n",
-    "    df_phrases = pd.DataFrame({\"phrases\": list(c.keys()), \"total\": list(c.values())})\n",
-    "\n",
-    "    # Take phrases that are repeated more than 40 times and turn it into a list\n",
-    "    df_phrases = ((df_phrases.loc[df_phrases[\"total\"] > 40])).reset_index(drop=True)\n",
-    "    common_phrases_list = df_phrases.phrases.tolist()\n",
-    "\n",
-    "    phrases_to_del = [\n",
-    "        \"san bernardino\",\n",
-    "        \"los angeles\",\n",
-    "        \"contra costa\",\n",
-    "        \"el dorado\",\n",
-    "        \"san luis obispo\",\n",
-    "        \"luis obispo\",\n",
-    "        \"del norte\",\n",
-    "        \"san francisco\",\n",
-    "        \"improve approximately\",\n",
-    "    ]\n",
-    "\n",
-    "    common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))\n",
-    "\n",
-    "    # CLean up the list to delete county information/etc\n",
-    "    words_to_delete = [\n",
-    "        \"county\",\n",
-    "        \"route\",\n",
-    "        \"dollar\",\n",
-    "        \"mile\",\n",
-    "        \"santa\",\n",
-    "        \"project\",\n",
-    "        \"san\",\n",
-    "        \"lanes\",\n",
-    "        \"lane\",\n",
-    "        \"2\",\n",
-    "        \"4\",\n",
-    "        \"financial\",\n",
-    "        \"prop\",\n",
-    "        \"best\",\n",
-    "        \"approximately\",\n",
-    "    ]\n",
-    "\n",
-    "    for word in words_to_delete:\n",
-    "        common_phrases_list = [x for x in common_phrases_list if word not in x]\n",
-    "\n",
-    "    # ADD certain keywords here\n",
-    "    # Operating Additional Service\n",
-    "    common_phrases_list.extend(values_to_add)\n",
-    "\n",
-    "    return common_phrases_list"
+    "lp2000_project, lp2000_county, lp2000_district, lp2000_award, lp2000_phase = load_lp2000(\"LP2000_projects.xlsx\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 45,
-   "id": "ec139873-4bb7-4428-9fd7-ceb9e247d4a3",
+   "execution_count": 63,
+   "id": "f7947161-a519-4342-8672-edbdba742984",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def categorize_projects(\n",
-    "    df,\n",
-    "    description_column: str,\n",
-    "    project_id_column: str,\n",
-    "    title_column: str,\n",
-    "    values_to_add: list,\n",
-    "):\n",
-    "\n",
-    "    # Find most common 2 word phrases for some automatic project categories\n",
-    "    common_phrases_list = find_common_phrases(df, description_column, values_to_add)\n",
-    "\n",
-    "    # Place all the words in common_phrases_list into a blob named query\n",
-    "    # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa\n",
-    "    query = \"|\".join(common_phrases_list)\n",
-    "\n",
-    "    # Remove punctation and lower strings in original description column befores searching\n",
-    "    df[\"clean_description\"] = (\n",
-    "        df[description_column]\n",
-    "        .str.lower()\n",
-    "        .str.replace(\"-\", \" \", regex=True)\n",
-    "        .str.replace(\"(\", \" \", regex=True)\n",
-    "        .str.replace(\")\", \" \", regex=True)\n",
-    "        .str.replace(\".\", \" \", regex=True)\n",
-    "        .str.strip()\n",
-    "    )\n",
-    "\n",
-    "    # Search through description column for the most common phrases\n",
-    "    # Input the results in the new column\n",
-    "    df[\"auto_project_category\"] = df[\"clean_description\"].str.findall(\n",
-    "        r\"\\b({})\\b\".format(query)\n",
-    "    )\n",
-    "\n",
-    "    # Explode to take categories out of a list\n",
-    "    # Drop duplicate project keywords by title\n",
-    "    df = (\n",
-    "        df.explode(\"auto_project_category\")\n",
-    "        .sort_values([project_id_column, title_column])\n",
-    "        .drop_duplicates(\n",
-    "            subset=[\n",
-    "                description_column,\n",
-    "                project_id_column,\n",
-    "                title_column,\n",
-    "                \"auto_project_category\",\n",
-    "            ]\n",
-    "        )\n",
-    "    )\n",
-    "\n",
-    "    # Fill any uncategorized projects as \"Other\"\n",
-    "    df[\"auto_project_category\"] = (\n",
-    "        df[\"auto_project_category\"].fillna(\"Other\").str.title()\n",
-    "    )\n",
-    "\n",
-    "    # Correct spelling\n",
-    "    spell = Speller(lang=\"en\")\n",
-    "    df[\"auto_project_category\"] = df[\"auto_project_category\"].apply(\n",
-    "        lambda x: \" \".join([spell(i) for i in x.split()])\n",
-    "    )\n",
-    "\n",
-    "    # Summarize - put all the categories onto one line\n",
-    "    df = (\n",
-    "        df.groupby(\n",
-    "            [\n",
-    "                description_column,\n",
-    "                project_id_column,\n",
-    "                title_column,\n",
-    "            ]\n",
-    "        )[\"auto_project_category\"]\n",
-    "        .apply(\",\".join)\n",
-    "        .reset_index()\n",
-    "    )\n",
-    "\n",
-    "    return df"
+    "# lp2000_project = unique_project_number(lp2000_project)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 46,
-   "id": "d123f3b9-da23-4d4d-a2e2-dc3769100171",
+   "execution_count": 64,
+   "id": "4e674bed-ebd1-43eb-a944-721e30f22cfb",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(11272, 11272)"
+      ]
+     },
+     "execution_count": 64,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "def add_all_projects2():\n",
-    "\n",
-    "    # Load  dataframes\n",
-    "    state_rail_plan = harmonize_srp()\n",
-    "    lost = harominze_lost()\n",
-    "    sb1 = harmonize_sb1()\n",
-    "\n",
-    "    # Concat for df\n",
-    "    all_projects_df = pd.concat([lost, state_rail_plan, sb1])\n",
-    "\n",
-    "    # Categorize\n",
-    "    categories = categorize_projects(\n",
-    "        all_projects_df,\n",
-    "        \"project_description\",\n",
-    "        \"project_title\",\n",
-    "        \"project_id\",\n",
-    "        [\n",
-    "            \"operating\",\n",
-    "            \"service\",\n",
-    "            \"zero emission vehicle\",\n",
-    "            \"zev\",\n",
-    "            \"maintain/repair\",\n",
-    "            \"repair/replace\",\n",
-    "        ],\n",
-    "    )\n",
-    "\n",
-    "    # Merge categorized\n",
-    "    all_projects_df = pd.merge(\n",
-    "        all_projects_df.drop(columns=[\"clean_description\"]),\n",
-    "        categories,\n",
-    "        how=\"left\",\n",
-    "        on=[\"project_description\", \"project_title\", \"project_id\"],\n",
-    "    )\n",
-    "\n",
-    "    # Rename\n",
-    "    all_projects_df = all_projects_df.drop(columns=[\"auto_project_category_x\"]).rename(\n",
-    "        columns={\"auto_project_category_y\": \"auto_tagged_project_categories\"}\n",
-    "    )\n",
-    "    # Concat for gdf\n",
-    "    all_projects_gdf = pd.concat([sb1])\n",
-    "    all_projects_gdf = all_projects_gdf.set_geometry(\"location\")\n",
-    "\n",
-    "    return all_projects_df, all_projects_gdf"
+    "len(lp2000_project), lp2000_project.project_number.nunique()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 47,
-   "id": "87a29e05-0ba6-40cb-93e2-d097159e6235",
+   "execution_count": 75,
+   "id": "219ec508-8706-49f0-ae04-711599d967a2",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# all_projects, all_projects_geo = add_all_projects()"
+    "def add_project_number(df_with_project_name:pd.DataFrame, right_project:pd.DataFrame, merge_col:str)-> pd.DataFrame:\n",
+    "    m1 = pd.merge(df_with_project_name, right_project, on = merge_col, how = 'inner')\n",
+    "    return m1"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 48,
-   "id": "873a88ca-5a47-4bfe-a1d3-715a5bed05bb",
-   "metadata": {
-    "scrolled": true,
-    "tags": []
-   },
+   "execution_count": 71,
+   "id": "ad4f5fbd-918e-4c7d-9f11-74060ca6b9a2",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)"
+    "lp2000_project_subset = lp2000_project[['project_number', 'project_id']]"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "85cfedf8-14aa-4d6c-b30e-cc9f6ee5bbf8",
+   "cell_type": "code",
+   "execution_count": 72,
+   "id": "64e0b2f1-2214-4822-b5d9-6c27a9be79ee",
    "metadata": {},
+   "outputs": [],
    "source": [
-    "### Look at the data"
+    "lp2000_county_df = pd.merge(lp2000_project_subset, lp2000_county, on = 'project_id', how = 'inner')"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 51,
-   "id": "a7e39b78-af8b-4bc5-8911-572839a72b36",
-   "metadata": {
-    "tags": []
-   },
+   "execution_count": 73,
+   "id": "633651e5-f043-4a8a-a4b4-9e5d42ac3803",
+   "metadata": {},
    "outputs": [
     {
-     "ename": "KeyError",
-     "evalue": "\"Column(s) ['project_id'] do not exist\"",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mKeyError\u001b[0m                                  Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mall_projects\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlead_agency\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mproject_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnunique\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39msort_values(\n\u001b[1;32m      2\u001b[0m     \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, ascending\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m      3\u001b[0m )\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m10\u001b[39m)\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/groupby/generic.py:895\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[0m\n\u001b[1;32m    892\u001b[0m func \u001b[38;5;241m=\u001b[39m maybe_mangle_lambdas(func)\n\u001b[1;32m    894\u001b[0m op \u001b[38;5;241m=\u001b[39m GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args, kwargs)\n\u001b[0;32m--> 895\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    896\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m    897\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m result\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:172\u001b[0m, in \u001b[0;36mApply.agg\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    169\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_str()\n\u001b[1;32m    171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(arg):\n\u001b[0;32m--> 172\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(arg):\n\u001b[1;32m    174\u001b[0m     \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[1;32m    175\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magg_list_like()\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:496\u001b[0m, in \u001b[0;36mApply.agg_dict_like\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m    493\u001b[0m     selected_obj \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selected_obj\n\u001b[1;32m    494\u001b[0m     selection \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selection\n\u001b[0;32m--> 496\u001b[0m arg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43magg\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m selected_obj\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m    499\u001b[0m     \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n\u001b[1;32m    500\u001b[0m     colg \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_gotitem(selection, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n",
-      "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:619\u001b[0m, in \u001b[0;36mApply.normalize_dictlike_arg\u001b[0;34m(self, how, obj, func)\u001b[0m\n\u001b[1;32m    617\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m    618\u001b[0m         cols_sorted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(safe_sort(\u001b[38;5;28mlist\u001b[39m(cols)))\n\u001b[0;32m--> 619\u001b[0m         \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcols_sorted\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m do not exist\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m    621\u001b[0m aggregator_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m    623\u001b[0m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[1;32m    624\u001b[0m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[1;32m    625\u001b[0m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[1;32m    626\u001b[0m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n",
-      "\u001b[0;31mKeyError\u001b[0m: \"Column(s) ['project_id'] do not exist\""
-     ]
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_number</th>\n",
+       "      <th>project_id</th>\n",
+       "      <th>county_name</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>4377</th>\n",
+       "      <td>0b952b66e020</td>\n",
+       "      <td>5202(007)</td>\n",
+       "      <td>Los Angeles County</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     project_number project_id         county_name\n",
+       "4377   0b952b66e020  5202(007)  Los Angeles County"
+      ]
+     },
+     "execution_count": 73,
+     "metadata": {},
+     "output_type": "execute_result"
     }
    ],
    "source": [
-    "all_projects.groupby([\"lead_agency\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
-    "    \"project_id\", ascending=False\n",
-    ").head(10)"
+    "lp2000_county_df.sample()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "daa0b1d3-4416-4537-b568-bdaae9fd1fdb",
+   "execution_count": 74,
+   "id": "d45e35b7-604a-4103-ad2c-8e8ba3d8946a",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_id</th>\n",
+       "      <th>comment_desc</th>\n",
+       "      <th>est_total_prj_costs</th>\n",
+       "      <th>location_name</th>\n",
+       "      <th>project_label_name</th>\n",
+       "      <th>original_post_mile_begin_id</th>\n",
+       "      <th>original_post_mile_end_id</th>\n",
+       "      <th>revised_post_mile_begin_ind</th>\n",
+       "      <th>revised_post_mile_end_ind</th>\n",
+       "      <th>route_name</th>\n",
+       "      <th>state_hwy_ind</th>\n",
+       "      <th>senate_district_code</th>\n",
+       "      <th>update_date_time</th>\n",
+       "      <th>agency_name</th>\n",
+       "      <th>urban_area_code</th>\n",
+       "      <th>work_type_desc</th>\n",
+       "      <th>category_desc</th>\n",
+       "      <th>current_phase</th>\n",
+       "      <th>active_transportation_program__atp_</th>\n",
+       "      <th>bridge_inspection___scour_evaluation</th>\n",
+       "      <th>covid_relief_funds_for_highway_infrastructure_programs_for_stip_covid_augmentation</th>\n",
+       "      <th>carbon_reduction_program__crp_</th>\n",
+       "      <th>congestion_mitigation___air_quality_improvement_program__cmaq_</th>\n",
+       "      <th>coronavirus_response_and_relief_supplemental_appropriations_act__crrsaa__funds</th>\n",
+       "      <th>corridor_mobility_improvement_account__cmia__program</th>\n",
+       "      <th>county_exchange_funds</th>\n",
+       "      <th>county_state_match_program</th>\n",
+       "      <th>earmarks_projects__hpp,_demo_cpfcds,_etc__</th>\n",
+       "      <th>emergency_relief__er_</th>\n",
+       "      <th>ferry_boat_program__fbp__and_ferry_boat_discretionary__fbd__program</th>\n",
+       "      <th>funds_for_planning,_programming_and_monitoring___rip</th>\n",
+       "      <th>general_funded_designated_programs</th>\n",
+       "      <th>hazard_elimination_safety__hes_</th>\n",
+       "      <th>high_risk_rural_roads_program__hr3_</th>\n",
+       "      <th>highway_bridge_</th>\n",
+       "      <th>highway_safety_improvement_program__hsip___infrastructure__state_fund</th>\n",
+       "      <th>highway_safety_improvement_program__hsip___non_infrastructure_</th>\n",
+       "      <th>highway_safety_improvement_program__hsip__infrastructure__federal_fund</th>\n",
+       "      <th>local_partnership_program__lpp_–_competitive__</th>\n",
+       "      <th>local_roads</th>\n",
+       "      <th>local_roads_rehabilitation</th>\n",
+       "      <th>railroad_grade_crossing_protection</th>\n",
+       "      <th>railroad_grade_separations</th>\n",
+       "      <th>rebuilding_american_infrastructure_with_sustainability_and_equity__raise__and_multimodal_project_discretionary_grant_programs__e_g_,_infra,_mega,_rstg_or_rural__</th>\n",
+       "      <th>regional_improvement_program_–_regional_share_of_stip_transportation_enhancement__off_system_</th>\n",
+       "      <th>regional_surface_transportation_block_grant_program__rstbgp__and_highway_infrastructure_program__hip_</th>\n",
+       "      <th>regional_transportation_planning_agency__rtpa__stp_match_exchange</th>\n",
+       "      <th>sb1_funded_freeway_service_patrol</th>\n",
+       "      <th>shopp__traffic_light_synchronization_program__tlsp___proposition_1b_bond_funds</th>\n",
+       "      <th>safe_routes_to_school__sr2s_and_srts_</th>\n",
+       "      <th>set_aside_coordinated_border_infrastructure__cbi__program_under_fast_act</th>\n",
+       "      <th>solutions_for_congested_corridors_program__sccp_</th>\n",
+       "      <th>special_programs</th>\n",
+       "      <th>state_local_partnership_program__slpp__and_local_partnership_program__lpp_formulaic_</th>\n",
+       "      <th>structures_seismic_retrofit_</th>\n",
+       "      <th>trade_corridor_enhancement_account__tcea__programs_–_local_share</th>\n",
+       "      <th>trade_corridor_enhancement_account__tcea__programs_–_state_share</th>\n",
+       "      <th>trade_corridors_improvement_fund__tcif__program_local_streets___roads</th>\n",
+       "      <th>traffic_congestion_relief_program___tcrp__</th>\n",
+       "      <th>project_number</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>4377</th>\n",
+       "      <td>5202(007)</td>\n",
+       "      <td>file has been transferred to Saad issa on 12/14/05. Mike Benyamin\\n\\nInactive- ms</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Various Locations Citywide</td>\n",
+       "      <td>Traffic Sign Upgrades</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>0-SFR</td>\n",
+       "      <td>N</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>2010-08-09 20:04:27</td>\n",
+       "      <td>San Fernando</td>\n",
+       "      <td>3041.00</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Signs</td>\n",
+       "      <td>single phase</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>Unknown</td>\n",
+       "      <td>0b952b66e020</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "     project_id  \\\n",
+       "4377  5202(007)   \n",
+       "\n",
+       "                                                                           comment_desc  \\\n",
+       "4377  file has been transferred to Saad issa on 12/14/05. Mike Benyamin\\n\\nInactive- ms   \n",
+       "\n",
+       "      est_total_prj_costs               location_name     project_label_name  \\\n",
+       "4377                  NaN  Various Locations Citywide  Traffic Sign Upgrades   \n",
+       "\n",
+       "      original_post_mile_begin_id  original_post_mile_end_id  \\\n",
+       "4377                          NaN                        NaN   \n",
+       "\n",
+       "     revised_post_mile_begin_ind revised_post_mile_end_ind route_name  \\\n",
+       "4377                         NaN                       NaN      0-SFR   \n",
+       "\n",
+       "     state_hwy_ind  senate_district_code    update_date_time   agency_name  \\\n",
+       "4377             N                   NaN 2010-08-09 20:04:27  San Fernando   \n",
+       "\n",
+       "      urban_area_code work_type_desc category_desc current_phase  \\\n",
+       "4377          3041.00            NaN         Signs  single phase   \n",
+       "\n",
+       "     active_transportation_program__atp_ bridge_inspection___scour_evaluation  \\\n",
+       "4377                             Unknown                              Unknown   \n",
+       "\n",
+       "     covid_relief_funds_for_highway_infrastructure_programs_for_stip_covid_augmentation  \\\n",
+       "4377                                                                            Unknown   \n",
+       "\n",
+       "     carbon_reduction_program__crp_  \\\n",
+       "4377                        Unknown   \n",
+       "\n",
+       "     congestion_mitigation___air_quality_improvement_program__cmaq_  \\\n",
+       "4377                                                        Unknown   \n",
+       "\n",
+       "     coronavirus_response_and_relief_supplemental_appropriations_act__crrsaa__funds  \\\n",
+       "4377                                                                        Unknown   \n",
+       "\n",
+       "     corridor_mobility_improvement_account__cmia__program  \\\n",
+       "4377                                              Unknown   \n",
+       "\n",
+       "     county_exchange_funds county_state_match_program  \\\n",
+       "4377               Unknown                    Unknown   \n",
+       "\n",
+       "     earmarks_projects__hpp,_demo_cpfcds,_etc__ emergency_relief__er_  \\\n",
+       "4377                                    Unknown               Unknown   \n",
+       "\n",
+       "     ferry_boat_program__fbp__and_ferry_boat_discretionary__fbd__program  \\\n",
+       "4377                                                             Unknown   \n",
+       "\n",
+       "     funds_for_planning,_programming_and_monitoring___rip  \\\n",
+       "4377                                              Unknown   \n",
+       "\n",
+       "     general_funded_designated_programs hazard_elimination_safety__hes_  \\\n",
+       "4377                            Unknown                         Unknown   \n",
+       "\n",
+       "     high_risk_rural_roads_program__hr3_ highway_bridge_  \\\n",
+       "4377                             Unknown         Unknown   \n",
+       "\n",
+       "     highway_safety_improvement_program__hsip___infrastructure__state_fund  \\\n",
+       "4377                                                               Unknown   \n",
+       "\n",
+       "     highway_safety_improvement_program__hsip___non_infrastructure_  \\\n",
+       "4377                                                        Unknown   \n",
+       "\n",
+       "     highway_safety_improvement_program__hsip__infrastructure__federal_fund  \\\n",
+       "4377                                                                Unknown   \n",
+       "\n",
+       "     local_partnership_program__lpp_–_competitive__ local_roads  \\\n",
+       "4377                                        Unknown     Unknown   \n",
+       "\n",
+       "     local_roads_rehabilitation railroad_grade_crossing_protection  \\\n",
+       "4377                    Unknown                            Unknown   \n",
+       "\n",
+       "     railroad_grade_separations  \\\n",
+       "4377                    Unknown   \n",
+       "\n",
+       "     rebuilding_american_infrastructure_with_sustainability_and_equity__raise__and_multimodal_project_discretionary_grant_programs__e_g_,_infra,_mega,_rstg_or_rural__  \\\n",
+       "4377                                                                                                                                                           Unknown   \n",
+       "\n",
+       "     regional_improvement_program_–_regional_share_of_stip_transportation_enhancement__off_system_  \\\n",
+       "4377                                                                                       Unknown   \n",
+       "\n",
+       "     regional_surface_transportation_block_grant_program__rstbgp__and_highway_infrastructure_program__hip_  \\\n",
+       "4377                                                                                               Unknown   \n",
+       "\n",
+       "     regional_transportation_planning_agency__rtpa__stp_match_exchange  \\\n",
+       "4377                                                           Unknown   \n",
+       "\n",
+       "     sb1_funded_freeway_service_patrol  \\\n",
+       "4377                           Unknown   \n",
+       "\n",
+       "     shopp__traffic_light_synchronization_program__tlsp___proposition_1b_bond_funds  \\\n",
+       "4377                                                                        Unknown   \n",
+       "\n",
+       "     safe_routes_to_school__sr2s_and_srts_  \\\n",
+       "4377                               Unknown   \n",
+       "\n",
+       "     set_aside_coordinated_border_infrastructure__cbi__program_under_fast_act  \\\n",
+       "4377                                                                  Unknown   \n",
+       "\n",
+       "     solutions_for_congested_corridors_program__sccp_ special_programs  \\\n",
+       "4377                                          Unknown          Unknown   \n",
+       "\n",
+       "     state_local_partnership_program__slpp__and_local_partnership_program__lpp_formulaic_  \\\n",
+       "4377                                                                              Unknown   \n",
+       "\n",
+       "     structures_seismic_retrofit_  \\\n",
+       "4377                      Unknown   \n",
+       "\n",
+       "     trade_corridor_enhancement_account__tcea__programs_–_local_share  \\\n",
+       "4377                                                          Unknown   \n",
+       "\n",
+       "     trade_corridor_enhancement_account__tcea__programs_–_state_share  \\\n",
+       "4377                                                          Unknown   \n",
+       "\n",
+       "     trade_corridors_improvement_fund__tcif__program_local_streets___roads  \\\n",
+       "4377                                                               Unknown   \n",
+       "\n",
+       "     traffic_congestion_relief_program___tcrp__ project_number  \n",
+       "4377                                    Unknown   0b952b66e020  "
+      ]
+     },
+     "execution_count": 74,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "all_projects[\n",
-    "    (all_projects.county == \"Kern\")\n",
-    "    & (all_projects.project_description.str.contains(\"Seal Coat\"))\n",
-    "].drop(columns=[\"location\"])"
+    "lp2000_project.loc[lp2000_project.project_id == \"5202(007)\"]"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "dc906308-31d4-4fde-b492-8218b05cec90",
+   "execution_count": 77,
+   "id": "03314dd0-426d-4e29-a6cf-6450757356ae",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(11272, 11272)"
+      ]
+     },
+     "execution_count": 77,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})"
+    "len(lp2000_project), lp2000_project.project_id.nunique()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "d2d6ac3a-c517-4df2-b907-0bac0a09e34a",
+   "execution_count": 81,
+   "id": "5380e43e-d74d-44a9-a1df-ffe1f916aefd",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "11263"
+      ]
+     },
+     "execution_count": 81,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "all_projects.groupby([\"auto_tagged_project_categories\"]).agg(\n",
-    "    {\"project_id\": \"nunique\"}\n",
-    ").sort_values(\"project_id\", ascending=False).head(10)"
+    "len(lp2000_project.drop(columns = ['project_id']).drop_duplicates())"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "5150da00-2a30-4f4d-bec8-1d9e5c66d623",
+   "execution_count": 82,
+   "id": "c0725be7-0bad-4e3e-b664-4db1ade8b3c5",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "9"
+      ]
+     },
+     "execution_count": 82,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
    "source": [
-    "all_projects.groupby([\"project_category\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
-    "    \"project_id\", ascending=False\n",
-    ").head(10)"
+    "11272-11263"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1bf38631-a734-47b0-9465-fcfb8ebafcad",
+   "cell_type": "markdown",
+   "id": "fd3d068b-b95f-494d-bbbd-e0605c68f616",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "all_projects.groupby([\"project_description\"]).agg(\n",
-    "    {\"project_id\": \"nunique\"}\n",
-    ").sort_values(\"project_id\", ascending=False).head(10)"
+    "### CTIPS"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5c1baa16-e15c-48e7-9772-ef67755f9d21",
+   "id": "b8f5a6af-db4a-4d1a-9bcc-7e8324b61947",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.groupby([\"county\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n",
-    "    \"project_id\", ascending=False\n",
-    ").head(10)"
+    "def load_ctips(file: str):\n",
+    "    \"\"\"\n",
+    "    Load every CTIPS worksheet from a single Excel workbook.\n",
+    "\n",
+    "    The workbook is read with one `pd.read_excel` call for all sheets\n",
+    "    rather than one call (and one file fetch) per sheet.\n",
+    "\n",
+    "    Args:\n",
+    "        file: workbook file name under `{GCS_FILE_PATH}LP2000_CTIPS/`.\n",
+    "\n",
+    "    Returns:\n",
+    "        Tuple of DataFrames, each passed through `to_snakecase`, in the\n",
+    "        order: project, county, district, phase_funding, awards, us_house,\n",
+    "        senate, assembly.\n",
+    "    \"\"\"\n",
+    "    # Sheet names, in the order the DataFrames are returned.\n",
+    "    sheet_names = [\n",
+    "        \"project\",\n",
+    "        \"county\",\n",
+    "        \"district\",\n",
+    "        \"phase_funding\",\n",
+    "        \"awards\",\n",
+    "        \"us_house\",\n",
+    "        \"senate\",\n",
+    "        \"assembly\",\n",
+    "    ]\n",
+    "\n",
+    "    # Passing a list as sheet_name yields a dict of {sheet name: DataFrame}.\n",
+    "    sheets = pd.read_excel(\n",
+    "        f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=sheet_names\n",
+    "    )\n",
+    "\n",
+    "    # Keep the original return order so tuple unpacking by callers still works.\n",
+    "    return tuple(to_snakecase(sheets[name]) for name in sheet_names)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "9d55e4ed-9b69-4111-b2ed-69715c9d90c5",
+   "id": "9372c9cb-a54a-4a2d-8f64-ca919f6b7b75",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.lead_agency.nunique()"
+    "ctips_project, ctips_county, ctips_district, ctips_phase, ctips_award, ctips_house, ctips_senate, ctips_assembly = load_ctips('CTIPS.xlsx')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "01a534d9-75e4-4ff8-aa11-99db480de733",
+   "id": "1a53792b-c209-4303-a14f-59ea9ef03c9c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.total_project_cost.describe()"
+    "ctips_project.sample()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4940bb3c-6170-4e12-a8ff-c4e97d7dbff2",
+   "metadata": {},
+   "source": [
+    "### State Rail Plan"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "6985e5d0-cf27-423f-8775-16eb3c518beb",
-   "metadata": {
-    "tags": []
-   },
+   "id": "8a61f896-808c-44cc-ae3b-5c651bcee78e",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.loc[all_projects.fully_funded == \"Fully funded\"].groupby(\n",
-    "    [\"data_source\"]\n",
-    ").agg({\"project_id\": \"nunique\"})"
+    "srp_df = har_utils.load_state_rail_plan()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "3259fc95-2db6-46ad-8cc6-a0357aa19077",
+   "id": "5469cc55-0034-469e-b199-991ca7ada378",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.loc[all_projects.fully_funded == \"Partially funded\"].groupby(\n",
-    "    [\"data_source\"]\n",
-    ").agg({\"project_id\": \"nunique\"})"
+    "srp_df = generate_alphanumeric_ids(srp_df, 8)"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "2ef08825-9e29-4268-9172-d0d83e08243b",
+   "id": "95a7939f-4021-4b00-b036-07d5fed90a4a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.groupby([\"data_source\"]).agg({\"project_id\": \"nunique\"})"
+    "srp_df.sample()"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "5fae701e-4132-4d06-8c27-3e598e072172",
+   "id": "b2bfef02-8d49-4a62-b49b-3807610f9fe4",
    "metadata": {},
    "outputs": [],
    "source": [
-    "all_projects.groupby([\"fully_funded\"]).agg(\n",
-    "    {\"project_id\": \"nunique\"}\n",
-    ").reset_index().sort_values(\"project_id\", ascending=False)"
+    "srp_df_agency = separate_out_df(srp_df, ['project_number', 'lead_agency'])"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "171611d6-acf9-46d8-9814-20534114d43e",
+   "id": "98052546-11e9-4e94-8cc1-0e334e5606f4",
    "metadata": {},
    "outputs": [],
-   "source": [
-    "all_projects.groupby([\"data_source\", \"fully_funded\"]).agg({\"project_id\": \"nunique\"})"
-   ]
+   "source": []
   }
  ],
  "metadata": {
diff --git a/project_list/ctips_01_18_2024.ipynb b/project_list/ctips_01_18_2024.ipynb
deleted file mode 100644
index f7f4cb735..000000000
--- a/project_list/ctips_01_18_2024.ipynb
+++ /dev/null
@@ -1,1422 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "id": "1b222a19",
-   "metadata": {},
-   "source": [
-    "## CTIPS\n",
-    "* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do\n",
-    "\n",
-    "### To do\n",
-    "* Ask if DSHOPP means draft SHOPP project\n",
-    "* PROJSCHE - not a lot of matches"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "75094621",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import pandas as pd \n",
-    "import sqlalchemy \n",
-    "import sys \n",
-    "import re\n",
-    "import oracledb "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "94838472",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "oracledb.version = \"8.3.0\" \n",
-    "sys.modules[\"cx_Oracle\"] = oracledb "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a4a53471",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.options.display.max_columns = 100\n",
-    "pd.options.display.float_format = \"{:.2f}\".format\n",
-    "pd.set_option(\"display.max_rows\", None)\n",
-    "pd.set_option(\"display.max_colwidth\", None)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "90056c61",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "\n",
-    "ENGINE_PATH_WIN_AUTH =  f\"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}\" "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ee70eded",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH)   "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "6665d753",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def to_snakecase(df):\n",
-    "    df.columns = df.columns.str.lower().str.replace(' ','_')\n",
-    "    return df"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b429d5f5",
-   "metadata": {},
-   "source": [
-    "### Project\n",
-    "Project.agencyid = project sponsor\n",
-    "\n",
-    "Implpaed = Implementing Agency for PA&ED\n",
-    "\n",
-    "Implpse = Implementing Agency for PS&E\n",
-    "\n",
-    "implcon = Implementing Agency for Construction\n",
-    "\n",
-    "implrw = Implementing Agency for Right of Way\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "697a0653",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "appdate, \n",
-    "archive,\n",
-    "agencyid,\n",
-    "bond99,\n",
-    "cmia,\n",
-    "ctips_id,\n",
-    "const_date,\n",
-    "countyid,\n",
-    "countyid2,\n",
-    "countyid3,\n",
-    "chg_offcl,\n",
-    "chg_qual1,\n",
-    "chg_qual2,\n",
-    "districtid,\n",
-    "document,\n",
-    "docyear,\n",
-    "ea_number,\n",
-    "high_ver,\n",
-    "high_offcl,\n",
-    "implpaed, \n",
-    "implpse, \n",
-    "implrw, \n",
-    "implcon, \n",
-    "lupdate, \n",
-    "needpurpose,\n",
-    "progcode1,\n",
-    "ppno,\n",
-    "proj_desc,\n",
-    "postmiles1,\n",
-    "pm1b,\n",
-    "pm2b,\n",
-    "pm3b,\n",
-    "pm1a,\n",
-    "pm2a,\n",
-    "pm3a,\n",
-    "projcomp_date,\n",
-    "projectid,\n",
-    "route1,\n",
-    "route2,\n",
-    "route3,\n",
-    "rtl,\n",
-    "stip,\n",
-    "shopp,\n",
-    "title,\n",
-    "tcif,\n",
-    "tcrpno,\n",
-    "tcrp,\n",
-    "urbanid,\n",
-    "version\n",
-    "FROM ctips.project\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "485deb02",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "56598a58",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df.projectid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "db0c7bb6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df.ctips_id.nunique()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "a1f4609c",
-   "metadata": {},
-   "source": [
-    "### A bit of cleaning"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1295f8de",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df = projects_df.fillna(projects_df.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64': 0}))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cd3800ae",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "string_cols = [col for col in projects_df.columns if projects_df[col].dtype == 'object']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "51ef6caf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "string_cols = [\n",
-    " 'needpurpose',\n",
-    " 'proj_desc',\n",
-    " 'route1',\n",
-    " 'title']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8c069d0d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "for i in string_cols:\n",
-    "        projects_df[i] = projects_df[i].str.title().str.lstrip().str.rstrip()\n",
-    "        projects_df[i] = projects_df[i].replace(r'\\s+', ' ', regex=True)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "3a47796d",
-   "metadata": {},
-   "source": [
-    "### 1 row = 1 project \n",
-    "* Some projects don't have a high version?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "665641c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df2 = projects_df.sort_values(by = ['high_offcl', 'high_ver','archive'], ascending = [False, False, False])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a800db0c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Drop projects by ctips_id\n",
-    "projects_df3 = projects_df2.drop_duplicates(subset = ['ctips_id'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d1e5c25e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Filter out projects that are finished\n",
-    "projects_df3 = projects_df3.loc[projects_df3.archive == 0]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "83d42e8a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df3.ctips_id.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3cee0056",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Filter out any rows where chg_qual1==7 because those are projects that are deleted\n",
-    "projects_df3 = projects_df3[projects_df3.chg_qual1 != 7]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "afe30974",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(projects_df3)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "9be1132f",
-   "metadata": {},
-   "source": [
-    "#### Ask if DSHOPP means draft shopp?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1de0e049",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projects_df3.document.unique()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "190d2323",
-   "metadata": {},
-   "source": [
-    "### PROJSCHE"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2ea89922",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "projectid,\n",
-    "m020 AS pa_ed_begin,\n",
-    "m200a AS pa_ed_end,\n",
-    "m200b AS ps_e_begin,\n",
-    "m224 AS begin_row,\n",
-    "m410 AS end_row,\n",
-    "m500 AS con_start_date,\n",
-    "m600 AS con_end_date,\n",
-    "m700 AS begin_closeout,\n",
-    "m800 AS end_closeout\n",
-    "FROM ctips.projsche\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ff9fd013",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_df.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9d335f70",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_df.info()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1a9ee8aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_drop_cols = list(projsche_df.columns)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d6a08152",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_drop_cols.remove('projectid')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cc2beec3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# I want to drop the rows in which ALL values in the date columns are empty\n",
-    "projsche_df2 = projsche_df.dropna(how = \"all\", subset = projsche_drop_cols).reset_index(drop = True)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bb3109a9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(projsche_df2), len(projsche_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4ab6a530",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_df2.projectid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e4c93886",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "projsche_df2.info()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b36bd39b",
-   "metadata": {},
-   "source": [
-    "#### Not a lot of matching values"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b323c658",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.merge(projsche_df2, projects_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "724cb2bf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.merge(projsche_df, projects_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a590548b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m1 = pd.merge(projects_df3, projsche_df2,  on ='projectid', how = 'left')"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "afdceff8",
-   "metadata": {},
-   "source": [
-    "### AGENCY"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f3e0d05b",
-   "metadata": {
-    "scrolled": true
-   },
-   "outputs": [],
-   "source": [
-    "agency_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "name AS agency_name,\n",
-    "agencyid\n",
-    "FROM ctips.agncy\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "be3c31ba",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.merge(m1, agency_df, on ='agencyid', how = 'outer', indicator = True)[['_merge']].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1fd05b1a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m2 = pd.merge(m1, agency_df,  on ='agencyid', how = 'left')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "edd6dcd3",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "agency_cols = ['agencyid', 'agency_name', 'implpaed', 'implpse', 'implrw', 'implcon']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "4c67a81d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m2.loc[m2.implpaed != \"None\"][agency_cols].sample()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0ceb0a3b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m2.loc[m2.implrw != \"None\"][agency_cols].head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "474ae25f",
-   "metadata": {},
-   "source": [
-    "### COUNTY"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "cfb16bcf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "county_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "name AS county_name,\n",
-    "countyid\n",
-    "FROM ctips.county\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a04fc040",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m3 = pd.merge(m2, county_df,  on ='countyid', how = 'left')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a8a21f03",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m3.sample()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b53cb205",
-   "metadata": {},
-   "source": [
-    "### FUNDLINE\n",
-    "* For action: Action: P = programmed, V= vote, A=award"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b6b5c018",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "action,\n",
-    "con,\n",
-    "rw,\n",
-    "pe_paed,\n",
-    "pe_env,\n",
-    "pe_rw,\n",
-    "pe_con,\n",
-    "pe_total,\n",
-    "fundlineid,\n",
-    "fundtypeid,\n",
-    "line_year,\n",
-    "actiondate\n",
-    "FROM ctips.fundline\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "60fdfd34",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.fundlineid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "db97ac33",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.fundlineid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "276178ce",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.fundtypeid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "89337dbb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.fundtypeid.value_counts().sample(5)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "904088cb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(fundline_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5045315e",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.action.value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "818684a9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundline_df.loc[fundline_df.fundtypeid == 20700009194]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "e2407ba7",
-   "metadata": {},
-   "source": [
-    "### Fundtype\n",
-    "* Fundtype.agencyid = funding agency"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f4efe574",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "fundtypeid,\n",
-    "fundid,\n",
-    "progcode,\n",
-    "programid,\n",
-    "projectid,\n",
-    "agencyid\n",
-    "FROM ctips.fundtype\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "3efede49",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a3633eac",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df.fundid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "425a034a",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df.projectid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "048155d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df.fundtypeid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b80d227f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_df.fundtypeid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6784b287",
-   "metadata": {},
-   "source": [
-    "### Do the merges\n",
-    "#### Merge fundtype and fundline"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5c042959",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.merge(fundtype_df,\n",
-    "         fundline_df,  \n",
-    "         on = ['fundtypeid'], \n",
-    "         how = \"outer\",\n",
-    "         indicator = True,)[['_merge']].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "121320f8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_m1 = pd.merge(fundtype_df,fundline_df,  on = ['fundtypeid'], how = \"left\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0c52d768",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(fund_m1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "567fa3bf",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_m1.head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dfc1128b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_m1.projectid.nunique(), fund_m1.fundtypeid.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a03c9f1f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_m1.fundtypeid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f54aa5aa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_m1.action.value_counts()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "bd6ec50d",
-   "metadata": {},
-   "source": [
-    "#### Merge subset of project with the merge above"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ae18d23b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "project_preview = ['ctips_id','projectid', 'high_ver', 'high_offcl']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "23be17fd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_m2 = pd.merge(m3[project_preview], fund_m1, on = ['projectid'], how = \"inner\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "71a47b1f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_m2.projectid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bb4336cd",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_m2.projectid.value_counts().describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "94b9fffe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "project_preview = project_preview + ['title']"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "5be904f1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fundtype_m2.columns"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8fd26495",
-   "metadata": {},
-   "source": [
-    "#### Aggregate"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "02e40aaa",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "columns_to_agg = {**dict.fromkeys(['con', 'rw',\n",
-    "       'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total'], 'sum')}\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "e34d14de",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "columns_to_agg"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f527e5f2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost = fundtype_m2.groupby(['ctips_id','fundid','progcode','programid']).agg(columns_to_agg).reset_index()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ab78b69c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost.sample(3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c588a4fc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost.con.describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1b3f6c66",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost.ctips_id.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "f5db3516",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(total_cost), total_cost.ctips_id.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fbd926f7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost.loc[total_cost.ctips_id == 20600003977]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "710e214f",
-   "metadata": {},
-   "source": [
-    "### Progmain"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "9afe9c87",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progmain_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "programid,\n",
-    "category AS program\n",
-    "FROM ctips.progmain\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "fea4fa65",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progmain_df.head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8c5a0735",
-   "metadata": {},
-   "source": [
-    "### Fund"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "df1da657",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "fund_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "fund,\n",
-    "fundid\n",
-    "FROM ctips.fund\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "b8971e64",
-   "metadata": {},
-   "source": [
-    "### Progsub"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "32dda1ed",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "progcode,\n",
-    "progdesc\n",
-    "FROM ctips.progsub\n",
-    "\"\"\", engine) "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "eb703a9c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df.head(1)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "87344fdc",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dced772d",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df.progcode.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8d9fe332",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "double_ids = ['20.30.010.820',\n",
-    "             '20.XX.723.000',\n",
-    "            '20.30.010.810',\n",
-    "             '20.XX.720.100',\n",
-    "             '20.30.010.817',\n",
-    "              '20.30.210.200'\n",
-    "             ]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "2b9cb8e6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df.loc[progsub_df.progcode.isin(double_ids)].sort_values('progcode')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "decf3559",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "progsub_df2 = progsub_df.drop_duplicates(subset = ['progcode'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "bf1cdbbe",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(progsub_df)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1a7e9af2",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(progsub_df2)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "6da3af8c",
-   "metadata": {},
-   "source": [
-    "#### Merge"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a945fa00",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_fin_df = (total_cost.merge(progmain_df, on = ['programid'], how = \"left\")\n",
-    "              .merge(fund_df, on =['fundid'], how = \"left\")\n",
-    "              .merge(progsub_df2, on = ['progcode'], how = 'left'))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "21a2f425",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_fin_df = final_fin_df.drop(columns = ['fundid', 'progcode','programid'])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0289feb1",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_fin_df.sample(3)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "83f92b9f",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_fin_df.projectid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "355bbeb4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "final_fin_df.projectid.value_counts().head()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "227aa0b4",
-   "metadata": {},
-   "source": [
-    "#### Find Total Cost\n",
-    "##### CLARIFY FTIP projects have `pe_total` value so figure out how to find the ftip projects and sum those up"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "d07114d8",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost = final_fin_df.groupby(['ctips_id']).agg(columns_to_agg).reset_index()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "c72d2702",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# pe_test = total_cost.loc[(total_cost.pe_con != 0) & (total_cost.pe_env != 0) & (total_cost.pe_rw != 0) & (total_cost.pe_paed != 0)& (total_cost.pe_total != 0)]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b4f913ca",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost['total_cost'] = total_cost.con + total_cost.rw + total_cost.pe_paed + total_cost.pe_env + total_cost.pe_rw + total_cost.pe_con"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "90d7e5c9",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# 6,638,471,000\n",
-    "total_cost['total_cost'].describe()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "74e1fd2c",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "total_cost.sort_values(by = ['total_cost'], ascending = False).head()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "b87393e7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(total_cost), total_cost.ctips_id.nunique()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8679ed80",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "len(m3), m3.ctips_id.nunique()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "45ac21f4",
-   "metadata": {},
-   "source": [
-    "#### Agency name missing?"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "79a85fc6",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m3.loc[m3.ctips_id == 20600002404]"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "8f89fb4a",
-   "metadata": {},
-   "source": [
-    "### Political"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a57b8486",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "political_df = pd.read_sql_query(\"\"\" \n",
-    "SELECT \n",
-    "assembly01,\n",
-    "ushouse01,\n",
-    "ssenate01,\n",
-    "projectid\n",
-    "FROM ctips.politcal\n",
-    "\"\"\", engine) \n",
-    "# Drop any rows with nulls\n",
-    "political_df = political_df.dropna(how = \"any\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ef3ebf38",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "pd.merge(m3, political_df, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "909827fb",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "m4 = pd.merge(m3, political_df, on ='projectid', how = 'left')"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.9.13"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/project_list/sb125_list.ipynb b/project_list/sb125_list.ipynb
index 8a05dc83b..44ca790c9 100644
--- a/project_list/sb125_list.ipynb
+++ b/project_list/sb125_list.ipynb
@@ -23,7 +23,7 @@
     "import geopandas as gpd\n",
     "import pandas as pd\n",
     "from calitp_data_analysis import utils\n",
-    "from calitp_data_analysis.sql import to_snakecase"
+    "from calitp_data_analysis.sql import to_snakecase\n"
    ]
   },
   {
@@ -46,7 +46,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/sb125/\""
+    "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/sb125/local_transit_list/\""
    ]
   },
   {
@@ -198,7 +198,7 @@
    "outputs": [],
    "source": [
     "def load_srp():\n",
-    "    df = har_utils.load_state_rail_plan()\n",
+    "    df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)\n",
     "    df[\"source\"] = \"State Rail Plan\"\n",
     "    df[\"program\"] = \"State Rail Plan\"\n",
     "    df[\"dds_phase\"] = \"Planned\"\n",
@@ -232,6 +232,7 @@
    "outputs": [],
    "source": [
     "def load_sb1():\n",
+    "    # Only includes in progress/incomplete projects\n",
     "    df = sb1_utils.load_sb1()\n",
     "    df[\"source\"] = \"SB1 Feature Server\"\n",
     "    df[\"dds_phase\"] = \"Under Construction\"\n",
@@ -292,131 +293,6 @@
     "sb1_df.projprogram.value_counts()"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 11,
-   "id": "09da2d83-6fa4-42c3-a2c3-5569a6d4ec54",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "Index(['projectid', 'projname', 'projcatcode', 'projcategory', 'projprogcode',\n",
-       "       'projprogram', 'multiprogfunded', 'projstatus', 'description', 'cost',\n",
-       "       'assemblydistrict', 'senatedistrict', 'assemblycode', 'senatecode',\n",
-       "       'countyname', 'cityname', 'countycode', 'citycode', 'appagencyname',\n",
-       "       'impagencyname', 'geometry', 'totalcosts', 'routes', 'constyear',\n",
-       "       'costfull', 'projagency', 'source', 'dds_phase'],\n",
-       "      dtype='object')"
-      ]
-     },
-     "execution_count": 11,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sb1_df.columns"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "646e8e9e-a61c-45f6-880f-371952dd1843",
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/html": [
-       "<div>\n",
-       "<style scoped>\n",
-       "    .dataframe tbody tr th:only-of-type {\n",
-       "        vertical-align: middle;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe tbody tr th {\n",
-       "        vertical-align: top;\n",
-       "    }\n",
-       "\n",
-       "    .dataframe thead th {\n",
-       "        text-align: right;\n",
-       "    }\n",
-       "</style>\n",
-       "<table border=\"1\" class=\"dataframe\">\n",
-       "  <thead>\n",
-       "    <tr style=\"text-align: right;\">\n",
-       "      <th></th>\n",
-       "      <th>projname</th>\n",
-       "      <th>cost</th>\n",
-       "      <th>totalcosts</th>\n",
-       "      <th>costfull</th>\n",
-       "    </tr>\n",
-       "  </thead>\n",
-       "  <tbody>\n",
-       "    <tr>\n",
-       "      <th>0</th>\n",
-       "      <td>Building Up Lossan North Improvement Program</td>\n",
-       "      <td>147930000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>147930000.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>1</th>\n",
-       "      <td>All Aboard</td>\n",
-       "      <td>40412000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>40412000.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>2</th>\n",
-       "      <td>DublinPleasanton Capacity Improvement And Congestion Reduction Program</td>\n",
-       "      <td>20500000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>20500000.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>3</th>\n",
-       "      <td>Los Angeles Region Transit System Integration And Modernization Program Of Projects</td>\n",
-       "      <td>1088499000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>1088499000.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>4</th>\n",
-       "      <td>Southwest Fresno Community Connector</td>\n",
-       "      <td>7798000</td>\n",
-       "      <td>NaN</td>\n",
-       "      <td>7798000.00</td>\n",
-       "    </tr>\n",
-       "  </tbody>\n",
-       "</table>\n",
-       "</div>"
-      ],
-      "text/plain": [
-       "                                                                              projname  \\\n",
-       "0                                         Building Up Lossan North Improvement Program   \n",
-       "1                                                                           All Aboard   \n",
-       "2               DublinPleasanton Capacity Improvement And Congestion Reduction Program   \n",
-       "3  Los Angeles Region Transit System Integration And Modernization Program Of Projects   \n",
-       "4                                                 Southwest Fresno Community Connector   \n",
-       "\n",
-       "         cost totalcosts      costfull  \n",
-       "0   147930000        NaN  147930000.00  \n",
-       "1    40412000        NaN   40412000.00  \n",
-       "2    20500000        NaN   20500000.00  \n",
-       "3  1088499000        NaN 1088499000.00  \n",
-       "4     7798000        NaN    7798000.00  "
-      ]
-     },
-     "execution_count": 12,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "sb1_df[[\"projname\", \"cost\", \"totalcosts\", \"costfull\"]].head()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "01a9b959-4980-4f01-a846-d2ee187483e7",
@@ -427,7 +303,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 11,
    "id": "1725c8ef-4f4a-4853-a0da-e522d2c66b8d",
    "metadata": {},
    "outputs": [],
@@ -441,7 +317,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
+   "execution_count": 12,
    "id": "1d61aabb-01c4-4dce-adc4-58fb8f34663a",
    "metadata": {},
    "outputs": [],
@@ -451,7 +327,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 15,
+   "execution_count": 13,
    "id": "c730a842-c9ff-42f4-a6b6-486448990623",
    "metadata": {},
    "outputs": [
@@ -581,7 +457,7 @@
        "1                  NaN                  NaN    Blackcat  Under Construction  "
       ]
      },
-     "execution_count": 15,
+     "execution_count": 13,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -592,7 +468,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 16,
+   "execution_count": 14,
    "id": "fa78dddc-c348-4ace-b04a-711adcc0c489",
    "metadata": {},
    "outputs": [],
@@ -600,6 +476,12 @@
     "def aggregate_to_one_line(\n",
     "    df: pd.DataFrame, column_to_group: str, column_to_summarize: str\n",
     "):\n",
+    "    \"\"\"\n",
+    "    Aggregate all values onto one line per grouping value.\n",
+    "    Ex: project ABC has two rows because it has two values for the \"fund\" column,\n",
+    "    as it receives money from fund 1 and fund 2. This function will\n",
+    "    combine them into a single row whose value is \"fund 1,fund 2\".\n",
+    "    \"\"\"\n",
     "    df[f\"new_{column_to_summarize}\"] = df.groupby(column_to_group)[\n",
     "        column_to_summarize\n",
     "    ].transform(lambda x: \",\".join(x))\n",
@@ -611,26 +493,27 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 17,
+   "execution_count": 15,
    "id": "84638089-3c86-46fa-9911-5ac18991de5d",
    "metadata": {},
    "outputs": [],
    "source": [
     "def load_lp2000(file: str):\n",
+    "    LP2000_PATH =  \"gs://calitp-analytics-data/data-analyses/project_list/LP2000_CTIPS/\"\n",
     "    df_project = to_snakecase(\n",
-    "        pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"project\")\n",
+    "        pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"project\")\n",
     "    )\n",
     "\n",
     "    df_county = to_snakecase(\n",
-    "        pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"county\")\n",
+    "        pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"county\")\n",
     "    ).drop(columns=[\"project_label_name\"])\n",
     "\n",
     "    df_district = to_snakecase(\n",
-    "        pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"district\")\n",
+    "        pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"district\")\n",
     "    ).drop(columns=[\"project_label_name\"])\n",
     "\n",
     "    df_award = to_snakecase(\n",
-    "        pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"awards\")\n",
+    "        pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"awards\")\n",
     "    )\n",
     "\n",
     "    # Clean up awards so if project has multiple entries, this is all\n",
@@ -678,12 +561,12 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 16,
    "id": "80a62aa6-79df-411c-b87f-c18cefa37af4",
    "metadata": {},
    "outputs": [],
    "source": [
-    "lp2000_df = load_lp2000(\"LP2000.xlsx\")"
+    "lp2000_df = load_lp2000(\"LP2000_projects.xlsx\")"
    ]
   },
   {
@@ -696,7 +579,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 19,
+   "execution_count": 17,
    "id": "8192764f-6b2c-41e6-892e-0b3debace384",
    "metadata": {},
    "outputs": [],
@@ -722,7 +605,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 20,
+   "execution_count": 18,
    "id": "d11cb637-71b5-4dde-81cd-84dae5f79ff4",
    "metadata": {},
    "outputs": [],
@@ -846,7 +729,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 21,
+   "execution_count": 19,
    "id": "ef2297c5-825c-49be-a892-3081052516c4",
    "metadata": {},
    "outputs": [
@@ -854,7 +737,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
       "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
      ]
     }
@@ -883,7 +766,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 22,
+   "execution_count": 20,
    "id": "eb25aa9a-92ad-4b95-bb62-6b3b0bca3495",
    "metadata": {},
    "outputs": [
@@ -929,11 +812,11 @@
        "  </thead>\n",
        "  <tbody>\n",
        "    <tr>\n",
-       "      <th>10447</th>\n",
+       "      <th>10448</th>\n",
        "      <td>No Title</td>\n",
        "      <td>Scag</td>\n",
-       "      <td>Westlake Macarthur Park Pedestrian Improvement Project. Install Pedestrian Improvements Incl Pedestrian Lighting, Sidewalk Enhancements, Street Furniture &amp; Trees, Enhanced Crosswalks, &amp; Bus Stop Amenities.</td>\n",
-       "      <td>1674000.00</td>\n",
+       "      <td>Western Av Bus Stop &amp; Pedestrian Improvement Project. Install Pedestrian And Transit Amenities To Enhance The Pedestrian Environment Along Western Av Btw Exposition Bl &amp; I-10 Freeway.</td>\n",
+       "      <td>1472000.00</td>\n",
        "      <td>0.00</td>\n",
        "      <td>Partially Funded</td>\n",
        "      <td>None</td>\n",
@@ -945,7 +828,7 @@
        "      <td>None</td>\n",
        "      <td>None</td>\n",
        "      <td>Scag Lrtp</td>\n",
-       "      <td>notes: System: Local Highway,  Route #: 0,  Route Name: Nan,  From: Union,  To: Hoover</td>\n",
+       "      <td>notes: System: Local Highway,  Route #: 0,  Route Name: Nan,  From: Exposition,  To: I-10</td>\n",
        "      <td></td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -954,28 +837,28 @@
       ],
       "text/plain": [
        "      project_title lead_agency  \\\n",
-       "10447      No Title        Scag   \n",
+       "10448      No Title        Scag   \n",
        "\n",
-       "                                                                                                                                                                                                 project_description  \\\n",
-       "10447  Westlake Macarthur Park Pedestrian Improvement Project. Install Pedestrian Improvements Incl Pedestrian Lighting, Sidewalk Enhancements, Street Furniture & Trees, Enhanced Crosswalks, & Bus Stop Amenities.   \n",
+       "                                                                                                                                                                           project_description  \\\n",
+       "10448  Western Av Bus Stop & Pedestrian Improvement Project. Install Pedestrian And Transit Amenities To Enhance The Pedestrian Environment Along Western Av Btw Exposition Bl & I-10 Freeway.   \n",
        "\n",
        "       total_project_cost  total_available_funds             phase post_mile  \\\n",
-       "10447          1674000.00                   0.00  Partially Funded      None   \n",
+       "10448          1472000.00                   0.00  Partially Funded      None   \n",
        "\n",
        "      county  city ct_district project_start_year project_completion_year  \\\n",
-       "10447   None  None        None               None                    None   \n",
+       "10448   None  None        None               None                    None   \n",
        "\n",
        "      geometry grant_program     source  \\\n",
-       "10447     None          None  Scag Lrtp   \n",
+       "10448     None          None  Scag Lrtp   \n",
        "\n",
-       "                                                                                         notes  \\\n",
-       "10447   notes: System: Local Highway,  Route #: 0,  Route Name: Nan,  From: Union,  To: Hoover   \n",
+       "                                                                                            notes  \\\n",
+       "10448   notes: System: Local Highway,  Route #: 0,  Route Name: Nan,  From: Exposition,  To: I-10   \n",
        "\n",
        "      funding_notes  \n",
-       "10447                "
+       "10448                "
       ]
      },
-     "execution_count": 22,
+     "execution_count": 20,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -986,7 +869,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 23,
+   "execution_count": 21,
    "id": "d2d15a6b-9c44-4124-8d60-da74a9180c52",
    "metadata": {},
    "outputs": [
@@ -994,7 +877,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
       "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
      ]
     }
@@ -1032,7 +915,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 22,
    "id": "a7102189-3816-4423-9220-0dc340603b37",
    "metadata": {},
    "outputs": [
@@ -1040,7 +923,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
       "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
      ]
     }
@@ -1084,7 +967,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 23,
    "id": "5648d023-4b18-4d8c-a7d2-0a43bc105649",
    "metadata": {},
    "outputs": [
@@ -1095,7 +978,7 @@
        "Name: grant_program, dtype: int64"
       ]
      },
-     "execution_count": 25,
+     "execution_count": 23,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1106,7 +989,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 26,
+   "execution_count": 24,
    "id": "a2d74753-0c1c-4a40-a5ed-761074112b13",
    "metadata": {},
    "outputs": [
@@ -1114,7 +997,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
       "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
      ]
     }
@@ -1151,7 +1034,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 27,
+   "execution_count": 25,
    "id": "d588d001-9707-472f-a9bb-5dbf2cfd0d95",
    "metadata": {},
    "outputs": [
@@ -1159,7 +1042,7 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
+      "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n",
       "  cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n"
      ]
     }
@@ -1201,22 +1084,23 @@
    "metadata": {},
    "source": [
     "### Stack\n",
-    "TO DO\n",
+    "Waiting\n",
     "* Clarify the monetary cols of SB1 & BlackCat\n",
-    "* Harmonize county/city/lead agency names\n",
-    "* LRTP grant program should be none'\n",
-    "* Categorize it?\n",
     "\n",
     "Christian's Notes\n",
     "* What amount of transit related projects are in this big list? \n",
     "* How big the projects are by cost? \n",
     "* Compare the cost of all the transit projects against all the projects in the list?\n",
-    "* Use percentages."
+    "* Use percentages.\n",
+    "\n",
+    "Done\n",
+    "* Harmonize county/city/lead agency names\n",
+    "* LRTP grant program should be None"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 28,
+   "execution_count": 26,
    "id": "2ebe704e-3375-4be9-bb2c-5ee6079ba0d3",
    "metadata": {},
    "outputs": [],
@@ -1235,7 +1119,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 78,
+   "execution_count": 27,
    "id": "189437cf-117a-4b8a-a89f-ae68fe988cc4",
    "metadata": {},
    "outputs": [],
@@ -1251,17 +1135,17 @@
     "    df[column] = df[column].replace(r\"\\s+\", \" \", regex=True)\n",
     "\n",
     "    # Remove specific characters\n",
-    "    chars_to_remove = [\"-\", \"/\", \")\", \"(\", \".\", 'County', 'Of','District']\n",
+    "    chars_to_remove = [\"-\", \"/\", \")\", \"(\", \".\", \"County\", \"Of\", \"District\"]\n",
     "    for char in chars_to_remove:\n",
     "        df[column] = df[column].str.replace(char, \"\")\n",
-    "    \n",
-    "    df[column] = df[column].astype(str).replace('\\d+', '', regex=True)\n",
+    "\n",
+    "    df[column] = df[column].astype(str).replace(\"\\d+\", \"\", regex=True)\n",
     "    return df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 79,
+   "execution_count": 28,
    "id": "6c911fdc-7bf4-4529-91bd-dcfe6667ee78",
    "metadata": {},
    "outputs": [
@@ -1269,28 +1153,28 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/tmp/ipykernel_928/180325038.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "/tmp/ipykernel_2642/309772486.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
       "  df[column] = df[column].str.replace(char, \"\")\n"
      ]
     }
    ],
    "source": [
-    "complete = clean_strings(complete, 'lead_agency')"
+    "complete = clean_strings(complete, \"lead_agency\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 80,
+   "execution_count": 29,
    "id": "7d458a7b-63ee-428a-9a81-61cf32b88e7d",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "1031"
+       "1056"
       ]
      },
-     "execution_count": 80,
+     "execution_count": 29,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1301,17 +1185,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 29,
+   "execution_count": 30,
    "id": "ccaacd62-b205-42d7-9338-bba3fdb27404",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "29381"
+       "29420"
       ]
      },
-     "execution_count": 29,
+     "execution_count": 30,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1322,14 +1206,14 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 30,
+   "execution_count": 31,
    "id": "4d391464-d622-44ae-99ec-3f6829ccc589",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "LP2000                11233\n",
+       "LP2000                11272\n",
        "Blackcat               3385\n",
        "Fresno Cog Lrtp        3147\n",
        "Scag Lrtp              2952\n",
@@ -1354,7 +1238,7 @@
        "Name: source, dtype: int64"
       ]
      },
-     "execution_count": 30,
+     "execution_count": 31,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1363,276 +1247,389 @@
     "complete.source.value_counts()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "3be25882-b1fb-47a5-9c3e-7fb9ea11e37b",
+   "metadata": {},
+   "source": [
+    "#### Try to find duplicated projects"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 58,
-   "id": "a6a748ee-6712-48fc-b607-4501823d3e58",
+   "execution_count": 32,
+   "id": "f47bff65-2a07-41b1-b5ee-8bf27de8b1fc",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_title</th>\n",
+       "      <th>lead_agency</th>\n",
+       "      <th>project_description</th>\n",
+       "      <th>total_project_cost</th>\n",
+       "      <th>total_available_funds</th>\n",
+       "      <th>phase</th>\n",
+       "      <th>post_mile</th>\n",
+       "      <th>county</th>\n",
+       "      <th>city</th>\n",
+       "      <th>ct_district</th>\n",
+       "      <th>project_start_year</th>\n",
+       "      <th>project_completion_year</th>\n",
+       "      <th>geometry</th>\n",
+       "      <th>grant_program</th>\n",
+       "      <th>source</th>\n",
+       "      <th>notes</th>\n",
+       "      <th>funding_notes</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>12136</th>\n",
+       "      <td>No Title</td>\n",
+       "      <td>Scag</td>\n",
+       "      <td>Widen Riverside Dr From Pipeline Ave To Fern Ave From 4 To 6 Lanes</td>\n",
+       "      <td>5089000.00</td>\n",
+       "      <td>0.00</td>\n",
+       "      <td>Partially Funded</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>None</td>\n",
+       "      <td>Scag Lrtp</td>\n",
+       "      <td>notes: System: Local Highway,  Route #: 0,  Route Name: Riverside Dr,  From: Pipeline Ave,  To: Fern Ave</td>\n",
+       "      <td></td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "      project_title lead_agency  \\\n",
+       "12136      No Title        Scag   \n",
+       "\n",
+       "                                                      project_description  \\\n",
+       "12136  Widen Riverside Dr From Pipeline Ave To Fern Ave From 4 To 6 Lanes   \n",
+       "\n",
+       "       total_project_cost  total_available_funds             phase post_mile  \\\n",
+       "12136          5089000.00                   0.00  Partially Funded      None   \n",
+       "\n",
+       "      county  city ct_district project_start_year project_completion_year  \\\n",
+       "12136   None  None        None               None                    None   \n",
+       "\n",
+       "      geometry grant_program     source  \\\n",
+       "12136     None          None  Scag Lrtp   \n",
+       "\n",
+       "                                                                                                           notes  \\\n",
+       "12136   notes: System: Local Highway,  Route #: 0,  Route Name: Riverside Dr,  From: Pipeline Ave,  To: Fern Ave   \n",
+       "\n",
+       "      funding_notes  \n",
+       "12136                "
+      ]
+     },
+     "execution_count": 32,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "complete.sample()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 33,
+   "id": "57a6ba28-e795-4bf2-a1b3-83f747d5c7b2",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "projects_main_info = complete.project_title + '-' + complete.project_description  + '-' + complete.source  + '-' + complete.county  + '-' + complete.notes + '-' + complete.total_project_cost.astype(str)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "id": "d22e2d61-847f-4c7d-b542-04fb1ef65b8a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "1062"
+       "pandas.core.series.Series"
       ]
      },
-     "execution_count": 58,
+     "execution_count": 34,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "len(complete[['lead_agency']].sort_values(by = ['lead_agency']).drop_duplicates())"
+    "type(projects_main_info)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 134,
-   "id": "b1de1321-4c54-4acd-b649-4a969d0b02a5",
+   "execution_count": 35,
+   "id": "8c79d083-50cd-4baf-acd9-02279e255ac6",
    "metadata": {},
    "outputs": [],
    "source": [
-    "transit_list = [\n",
-    "    \"buses\",\n",
-    "    \"van\",\n",
-    "    \"light rail\",\n",
-    "    \"light rail vehicles\",\n",
-    "    \"lrv\",\n",
-    "    \"train\",\n",
-    "    \"bus\",\n",
-    "    \"rail\",\n",
-    "    \"locomotives\",\n",
-    "    \"ferry\",\n",
-    "    \"vessels\",\n",
-    "    \"trolley\",\n",
-    "    \"vehicles\",\n",
-    "    \"emus\",\n",
-    "    \"trolleys\",\n",
-    "    \"turnouts\",\n",
-    "    \"routes\",\n",
-    "    \"station\",\n",
-    "    \"signals\",\n",
-    "    \"facility\",\n",
-    "    \"locations\",\n",
-    "    \"congestion\",\n",
-    "    \"rideshare\",\n",
-    "    \"ridesharing\",\n",
-    "    \"vanpool\",\n",
-    "    \"high quality transit areas\",\n",
-    "    \"hqta\",\n",
-    "    \"car share\",\n",
-    "    \"bus\",\n",
-    "    \"metro\",\n",
-    "    \"station\",  # Station comes up a few times as a charging station and also as a train station\n",
-    "    \"transit\",\n",
-    "    \"fare\",\n",
-    "    \"brt\",\n",
-    "    \"yarts\",\n",
-    "    \"railroad\",\n",
-    "    \"rider\",\n",
-    "    \"highway-rail\",\n",
-    "    \"bike\",\n",
-    "    \"bicycle\",\n",
-    "     'bus rapid transit',\n",
-    "    'transit-oriented development',\n",
-    "    'commuter rail',\n",
-    "    'bus stop',\n",
-    "    'shuttle',\n",
-    "    'mobility hub',\n",
-    "    'fare evasion',\n",
-    "    'park and ride',\n",
-    "    'bus lane',\n",
-    "    'bicycle lane',\n",
-    "    'multimodal',\n",
-    "    'farebox',\n",
-    "    'transfer',\n",
-    "    'intermodal',\n",
-    "    'paratransit',\n",
-    "    'bus route',\n",
-    "    'express bus',\n",
-    "    'bus terminal',\n",
-    "    'bus shelter',\n",
-    "    'bus depot',\n",
-    "    'bus service',\n",
-    "    'transit agency',\n",
-    "    'fare collection',\n",
-    "    'fare structure',\n",
-    "    'fare card',\n",
-    "    'transit signal priority',\n",
-    "    'bus rapid transit',\n",
-    "    'fare integration',\n",
-    "    'transportation equity',\n",
-    "    'mobility as a service',\n",
-    "    'fare subsidy',\n",
-    "    'fare payment',\n",
-    "    'integrated transit',\n",
-    "    'automated transit',\n",
-    "    'fare technology',\n",
-    "    'real-time transit',\n",
-    "    'mobility management',\n",
-    "    'bus network',\n",
-    "    'rail network',\n",
-    "    'public transportation',\n",
-    "    'commute',\n",
-    "    \"cyclist\",\n",
-    "    \"pedestrian\",\n",
-    "    ## including the spelling errors of `pedestrian`\n",
-    "    \"pedestrain\",\n",
-    "    \"crosswalk\",\n",
-    "    \"bulb out\",\n",
-    "    \"bulb-out\",\n",
-    "    \"active transp\",\n",
-    "    \"traffic reduction\",\n",
-    "    \"speed reduction\",\n",
-    "    \"ped\",\n",
-    "    \"srts\",\n",
-    "    \"safe routes to school\",\n",
-    "    \"sidewalk\",\n",
-    "    \"side walk\",\n",
-    "    \"trail\",\n",
-    "    \"atp\",\n",
-    "]"
+    "\n",
+    "main_info = projects_main_info.to_frame()"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 135,
-   "id": "d112f04a-8aac-4871-9b84-9daba4b596b7",
+   "execution_count": 36,
+   "id": "92a926d4-90c8-41a3-9b62-19ab0d81ba2e",
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Remove duplicates\n",
-    "cleaned_transit_list = list(set(transit_list))"
+    "main_info = main_info.rename(columns = {0:'project_info'})"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 136,
-   "id": "0fa39db5-a4f8-4702-8774-254aa167181e",
+   "execution_count": 37,
+   "id": "b607cb21-0735-46ee-9f32-840d31e09d9f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "['intermodal',\n",
-       " 'transfer',\n",
-       " 'vanpool',\n",
-       " 'pedestrian',\n",
-       " 'bulb-out',\n",
-       " 'integrated transit',\n",
-       " 'mobility management',\n",
-       " 'trolleys',\n",
-       " 'bus depot',\n",
-       " 'congestion',\n",
-       " 'sidewalk',\n",
-       " 'shuttle',\n",
-       " 'bus shelter',\n",
-       " 'trolley',\n",
-       " 'fare evasion',\n",
-       " 'signals',\n",
-       " 'metro',\n",
-       " 'brt',\n",
-       " 'mobility as a service',\n",
-       " 'fare structure',\n",
-       " 'van',\n",
-       " 'light rail vehicles',\n",
-       " 'rail network',\n",
-       " 'bulb out',\n",
-       " 'bus terminal',\n",
-       " 'lrv',\n",
-       " 'bicycle lane',\n",
-       " 'pedestrain',\n",
-       " 'yarts',\n",
-       " 'rideshare',\n",
-       " 'car share',\n",
-       " 'trail',\n",
-       " 'park and ride',\n",
-       " 'fare integration',\n",
-       " 'crosswalk',\n",
-       " 'ridesharing',\n",
-       " 'paratransit',\n",
-       " 'commuter rail',\n",
-       " 'speed reduction',\n",
-       " 'multimodal',\n",
-       " 'turnouts',\n",
-       " 'srts',\n",
-       " 'rider',\n",
-       " 'side walk',\n",
-       " 'fare subsidy',\n",
-       " 'transit signal priority',\n",
-       " 'train',\n",
-       " 'transportation equity',\n",
-       " 'rail',\n",
-       " 'commute',\n",
-       " 'light rail',\n",
-       " 'bus route',\n",
-       " 'safe routes to school',\n",
-       " 'fare collection',\n",
-       " 'ped',\n",
-       " 'buses',\n",
-       " 'locations',\n",
-       " 'cyclist',\n",
-       " 'farebox',\n",
-       " 'public transportation',\n",
-       " 'high quality transit areas',\n",
-       " 'transit-oriented development',\n",
-       " 'emus',\n",
-       " 'facility',\n",
-       " 'transit agency',\n",
-       " 'real-time transit',\n",
-       " 'railroad',\n",
-       " 'routes',\n",
-       " 'active transp',\n",
-       " 'atp',\n",
-       " 'vessels',\n",
-       " 'automated transit',\n",
-       " 'highway-rail',\n",
-       " 'bus rapid transit',\n",
-       " 'fare payment',\n",
-       " 'fare',\n",
-       " 'bus lane',\n",
-       " 'mobility hub',\n",
-       " 'transit',\n",
-       " 'traffic reduction',\n",
-       " 'ferry',\n",
-       " 'bus stop',\n",
-       " 'bus network',\n",
-       " 'fare technology',\n",
-       " 'express bus',\n",
-       " 'bicycle',\n",
-       " 'bike',\n",
-       " 'locomotives',\n",
-       " 'station',\n",
-       " 'bus',\n",
-       " 'bus service',\n",
-       " 'vehicles',\n",
-       " 'fare card',\n",
-       " 'hqta']"
+       "29420"
       ]
      },
-     "execution_count": 136,
+     "execution_count": 37,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "cleaned_transit_list"
+    "len(main_info)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 137,
-   "id": "c959117b-091d-4610-b36f-202f3dd97c9e",
+   "execution_count": 38,
+   "id": "958fabea-d9c7-4e70-9c18-53ee5792606d",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def filter_projects(\n",
-    "    df,\n",
-    "    columns_to_search: list,\n",
-    "    keywords_search: list,\n",
-    "    file_name: str,\n",
-    "    gcs_path: str,\n",
-    "    projects_to_del: list,\n",
-    "):\n",
+    "# Count how many times each concatenated project_info string occurs\n",
+    "value_counts_series = main_info.project_info.value_counts()\n",
+    "\n",
+    "# Convert the Series to a DataFrame with columns 'project_info' and 'total_values'\n",
+    "df_value_counts = value_counts_series.reset_index()\n",
+    "df_value_counts.columns = ['project_info', 'total_values']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "id": "063ff71f-032a-4564-8398-cfbd982859ab",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "28228"
+      ]
+     },
+     "execution_count": 39,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df_value_counts)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 40,
+   "id": "35cf6fe5-dba8-46c4-8a47-15d4eabac99f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "count   28228.00\n",
+       "mean        1.04\n",
+       "std         0.62\n",
+       "min         1.00\n",
+       "25%         1.00\n",
+       "50%         1.00\n",
+       "75%         1.00\n",
+       "max        34.00\n",
+       "Name: total_values, dtype: float64"
+      ]
+     },
+     "execution_count": 40,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_value_counts.total_values.describe()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "id": "afd35c1b-d15b-4b7a-98e4-2c8ba9ab175b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "403"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(df_value_counts.loc[df_value_counts.total_values > 1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "id": "320fe207-3176-4f70-85a5-52b627d12dc1",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>project_info</th>\n",
+       "      <th>total_values</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019,  grant encumbered amount: 56000.0,  local encumbered amount: 0.0,  total encumbered amount: 56000.0,  expendedamount: 0.0,  activebalance: 44800.0,  closedoutbalance: 0,  project status: Open-0.0</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019,  grant encumbered amount: 56000.0,  local encumbered amount: 0.0,  total encumbered amount: 56000.0,  expendedamount: 0.0,  activebalance: 11200.0,  closedoutbalance: 0,  project status: Open-0.0</td>\n",
+       "      <td>34</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>None-None-Sandag Lrtp-None- notes: Category: Nan,  Status: Nan,  Aqc 2016 1: Nan,  Aqc 2020 1: Nan,  Pricmcp: Nan,  Conncmcp: Nan,  Layer Name: Mobility Hubs And Flexible Fleets,  Corridor I: Nan,  Type 1: Nan,  Existing: Nan,  Limits: Nan,  Description 1: Nan,  Route: Nan,  Routetype: Nan,  Route Desc: Nan,  Rp 2021 Id: Nan,  Rp 2021 Id 1: Nan,  Capital Cost   2020  Millions: Nan-0.0</td>\n",
+       "      <td>31</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>None-Purchase Replacement &lt; 30 Ft Bus-Blackcat-None- grant fiscal year: 2021,  grant encumbered amount: 84000.0,  local encumbered amount: 0.0,  total encumbered amount: 84000.0,  expendedamount: 0.0,  activebalance: 84000.0,  closedoutbalance: 0,  project status: Open-0.0</td>\n",
+       "      <td>29</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Emergency Opening-None-LP2000-Tulare County- location name: nan,  route name: 0-CR,  state hwy ind: N,  senate district code: nan,  category desc: Emergency Opening,  district code: 6.0,  comment desc: Emergency Opening,  postmile combined: nan-0.0</td>\n",
+       "      <td>22</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                                                                                                                                                                                                                                                                                                                                                          project_info  \\\n",
+       "0                                                                                                                            None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019,  grant encumbered amount: 56000.0,  local encumbered amount: 0.0,  total encumbered amount: 56000.0,  expendedamount: 0.0,  activebalance: 44800.0,  closedoutbalance: 0,  project status: Open-0.0   \n",
+       "1                                                                                                                            None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019,  grant encumbered amount: 56000.0,  local encumbered amount: 0.0,  total encumbered amount: 56000.0,  expendedamount: 0.0,  activebalance: 11200.0,  closedoutbalance: 0,  project status: Open-0.0   \n",
+       "2  None-None-Sandag Lrtp-None- notes: Category: Nan,  Status: Nan,  Aqc 2016 1: Nan,  Aqc 2020 1: Nan,  Pricmcp: Nan,  Conncmcp: Nan,  Layer Name: Mobility Hubs And Flexible Fleets,  Corridor I: Nan,  Type 1: Nan,  Existing: Nan,  Limits: Nan,  Description 1: Nan,  Route: Nan,  Routetype: Nan,  Route Desc: Nan,  Rp 2021 Id: Nan,  Rp 2021 Id 1: Nan,  Capital Cost   2020  Millions: Nan-0.0   \n",
+       "3                                                                                                                    None-Purchase Replacement < 30 Ft Bus-Blackcat-None- grant fiscal year: 2021,  grant encumbered amount: 84000.0,  local encumbered amount: 0.0,  total encumbered amount: 84000.0,  expendedamount: 0.0,  activebalance: 84000.0,  closedoutbalance: 0,  project status: Open-0.0   \n",
+       "4                                                                                                                                             Emergency Opening-None-LP2000-Tulare County- location name: nan,  route name: 0-CR,  state hwy ind: N,  senate district code: nan,  category desc: Emergency Opening,  district code: 6.0,  comment desc: Emergency Opening,  postmile combined: nan-0.0   \n",
+       "\n",
+       "   total_values  \n",
+       "0            34  \n",
+       "1            34  \n",
+       "2            31  \n",
+       "3            29  \n",
+       "4            22  "
+      ]
+     },
+     "execution_count": 42,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_value_counts.loc[df_value_counts.total_values > 1].sort_values(by = ['total_values'], ascending = False).head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 43,
+   "id": "efe2427d-adfb-47ca-994e-83f442dedfa3",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# complete.loc[(complete.source == \"LP2000\") & (complete.project_title == \"Emergency Opening\")  & (complete.county == \"Tulare County\")]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 44,
+   "id": "c959117b-091d-4610-b36f-202f3dd97c9e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def filter_projects(\n",
+    "    df,\n",
+    "    columns_to_search: list,\n",
+    "    keywords_search: list,\n",
+    "    file_name: str,\n",
+    "    gcs_path: str,\n",
+    "    projects_to_del: list,\n",
+    "):\n",
     "\n",
     "    # Filter out for Cordon\n",
     "    df = _specific_list_utils.find_keywords(df, columns_to_search, keywords_search)\n",
@@ -1661,7 +1658,97 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 138,
+   "execution_count": 45,
+   "id": "e3325be6-ab38-4dd9-9a7f-17c05972bef7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "transit_terms = [\n",
+    "    \"automated transit\",\n",
+    "    \"brt\",\n",
+    "    \"bus\",\n",
+    "    \"bus depot\",\n",
+    "    \"bus lane\",\n",
+    "    \"bus lanes\",\n",
+    "    \"bus network\",\n",
+    "    \"bus rapid transit\",\n",
+    "    \"bus route\",\n",
+    "    \"bus routes\",\n",
+    "    \"bus service\",\n",
+    "    \"bus shelter\",\n",
+    "    \"bus stop\",\n",
+    "    \"bus terminal\",\n",
+    "    \"buses\",\n",
+    "    \"commuter rail\",\n",
+    "    \"express bus\",\n",
+    "    \"fare card\",\n",
+    "    \"fare collection\",\n",
+    "    \"fare evasion\",\n",
+    "    \"fare integration\",\n",
+    "    \"fare payment\",\n",
+    "    \"fare structure\",\n",
+    "    \"fare subsidy\",\n",
+    "    \"fare technology\",\n",
+    "    \"farebox\",\n",
+    "    \"ferry\",\n",
+    "    \"ferrys\",\n",
+    "    \"high quality transit areas\",\n",
+    "    \"integrated transit\",\n",
+    "    \"intermodal\",\n",
+    "    \"light rail\",\n",
+    "    \"light rail vehicles\",\n",
+    "    \"locomotives\",\n",
+    "    \"mobility as a service\",\n",
+    "    \"mobility hub\",\n",
+    "    \"multimodal\",\n",
+    "    \"paratransit\",\n",
+    "    \"rail\",\n",
+    "    \"rail network\",\n",
+    "    \"railroad\",\n",
+    "    \"shuttle\",\n",
+    "    \"shuttles\",\n",
+    "    \"station\",\n",
+    "    \"terminal\",\n",
+    "    \"train\",\n",
+    "    \"trains\",\n",
+    "    \"transit\",\n",
+    "    \"transit agency\",\n",
+    "    \"transit center\",\n",
+    "    \"transit hub\",\n",
+    "    \"transit signal priority\",\n",
+    "    \"transit-oriented development\",\n",
+    "    \"transportation equity\",\n",
+    "    \"trolley\",\n",
+    "    \"trolleys\",\n",
+    "    \"van\",\n",
+    "    \"vans\"\n",
+    "]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 46,
+   "id": "e00e746a-acdb-4232-8916-24159e10c7fe",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "58"
+      ]
+     },
+     "execution_count": 46,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(transit_terms)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 47,
    "id": "44233fea-c8a1-4f48-b96c-1f74b58b083c",
    "metadata": {},
    "outputs": [
@@ -1669,6 +1756,8 @@
      "name": "stderr",
      "output_type": "stream",
      "text": [
+      "/home/jovyan/data-analyses/project_list/_specific_list_utils.py:18: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
+      "  df[i]\n",
       "/home/jovyan/data-analyses/project_list/_specific_list_utils.py:18: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n",
       "  df[i]\n"
      ]
@@ -1681,7 +1770,7 @@
     "        \"project_title\",\n",
     "        \"project_description\",\n",
     "    ],\n",
-    "    cleaned_transit_list,\n",
+    "    transit_terms,\n",
     "    \"sb125_transit\",\n",
     "    GCS_FILE_PATH,\n",
     "    [],\n",
@@ -1690,17 +1779,17 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 139,
+   "execution_count": 48,
    "id": "45ac2172-9b5f-4506-b082-3277e7ddb280",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "(10672, 19)"
+       "(4186, 19)"
       ]
      },
-     "execution_count": 139,
+     "execution_count": 48,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1719,28 +1808,28 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 140,
+   "execution_count": 49,
    "id": "74c46a32-087b-41d1-801b-289a1ae54b90",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'36% of projects are related to Transit'"
+       "'14% or 4186 of projects in this list are related to Transit'"
       ]
      },
-     "execution_count": 140,
+     "execution_count": 49,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "f\"{(int(len(transit_df) / len(complete) * 100))}% of projects are related to Transit\""
+    "f\"{(int(len(transit_df) / len(complete) * 100))}% or {len(transit_df)} of projects in this list are related to Transit\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 141,
+   "execution_count": 50,
    "id": "a83ec341-f24c-4e47-800a-1889c48c9d8a",
    "metadata": {},
    "outputs": [],
@@ -1750,7 +1839,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 142,
+   "execution_count": 51,
    "id": "a9c90ab2-be8a-44a3-8dad-372a0b9e762e",
    "metadata": {},
    "outputs": [],
@@ -1760,7 +1849,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 143,
+   "execution_count": 52,
    "id": "9aa09882-5c2c-40f0-a93a-e74abd5d2916",
    "metadata": {},
    "outputs": [],
@@ -1770,7 +1859,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 144,
+   "execution_count": 53,
    "id": "ec85ce97-a0d7-469f-9138-caba4711a37d",
    "metadata": {},
    "outputs": [],
@@ -1780,60 +1869,60 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 145,
+   "execution_count": 54,
    "id": "a33a45f1-3047-4178-9c24-6a2384fece0a",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "'The total estimated cost is $333,288,985,140 compared to $669,784,021,877 in all the compiled projects. This makes up 49% of the requested funding'"
+       "'The total estimated cost is $299,572,489,073 compared to $670,035,689,953 in all the compiled projects. This makes up 44% of the requested funding (that we have on file).'"
       ]
      },
-     "execution_count": 145,
+     "execution_count": 54,
      "metadata": {},
      "output_type": "execute_result"
     }
    ],
    "source": [
-    "f\"The total estimated cost is ${formatted_total_cost} compared to ${formatted_projects_cost} in all the compiled projects. This makes up {int((transit_cost/total_projects_cost) * 100)}% of the requested funding\""
+    "f\"The total estimated cost is ${formatted_total_cost} compared to ${formatted_projects_cost} in all the compiled projects. This makes up {int((transit_cost/total_projects_cost) * 100)}% of the requested funding (that we have on file).\""
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 146,
+   "execution_count": 55,
    "id": "57514f7f-cece-467b-a4c2-d4a891c1878f",
    "metadata": {},
    "outputs": [
     {
      "data": {
       "text/plain": [
-       "LP2000                3216\n",
-       "Blackcat              1596\n",
-       "Scag Lrtp             1144\n",
-       "Fresno Cog Lrtp       1047\n",
-       "Kern Cog Lrtp          955\n",
-       "Sacog Lrtp             557\n",
-       "Madera Ctc Lrtp        300\n",
-       "Stancog Lrtp           240\n",
-       "Ambag Lrtp             227\n",
-       "Sbcag Lrtp             223\n",
-       "Slocog Lrtp            196\n",
-       "State Rail Plan        169\n",
-       "Scrtpa Lrtp            136\n",
-       "Tcag Lrtp              135\n",
-       "Mtc Lrtp               112\n",
-       "Bcag Lrtp              108\n",
-       "Sandag Lrtp             90\n",
-       "Sjcog Lrtp              87\n",
-       "Tmpo Lrtp               55\n",
-       "SB1 Feature Server      51\n",
-       "Mcagov Lrtp             26\n",
-       "Kcag Lrtp                2\n",
+       "Blackcat              1459\n",
+       "Scag Lrtp              660\n",
+       "LP2000                 639\n",
+       "Sacog Lrtp             201\n",
+       "Madera Ctc Lrtp        199\n",
+       "Fresno Cog Lrtp        181\n",
+       "State Rail Plan        161\n",
+       "Mtc Lrtp               107\n",
+       "Sbcag Lrtp              95\n",
+       "Sjcog Lrtp              63\n",
+       "Sandag Lrtp             63\n",
+       "Scrtpa Lrtp             60\n",
+       "SB1 Feature Server      50\n",
+       "Stancog Lrtp            49\n",
+       "Kern Cog Lrtp           45\n",
+       "Slocog Lrtp             35\n",
+       "Tmpo Lrtp               31\n",
+       "Bcag Lrtp               26\n",
+       "Ambag Lrtp              26\n",
+       "Tcag Lrtp               25\n",
+       "Mcagov Lrtp             10\n",
+       "Kcag Lrtp                1\n",
        "Name: source, dtype: int64"
       ]
      },
-     "execution_count": 146,
+     "execution_count": 55,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -1842,79 +1931,95 @@
     "transit_df.source.value_counts()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "e349dfc3-bd19-45d9-af24-6998814a77a2",
+   "metadata": {},
+   "source": [
+    "#### Keywords that were picked up"
+   ]
+  },
   {
    "cell_type": "code",
-   "execution_count": 147,
+   "execution_count": 56,
    "id": "0723a569-2807-4cfe-a552-4259132ef40a",
    "metadata": {},
    "outputs": [],
    "source": [
-    "def count_categories(df:pd.DataFrame, column:str):\n",
+    "def count_categories(df: pd.DataFrame, column: str):\n",
     "    # Convert the result to a DataFrame\n",
-    "    filtered_df = df.loc[df[column] != 'keyword not found'][[column]].value_counts()\n",
-    "    result_df = pd.DataFrame(filtered_df, columns=['Count'])\n",
+    "    filtered_df = df.loc[df[column] != \"keyword not found\"][[column]].value_counts()\n",
+    "    result_df = pd.DataFrame(filtered_df, columns=[\"Count\"])\n",
     "\n",
     "    # Reset the index to make the keyword a regular column\n",
     "    result_df = result_df.reset_index()\n",
     "\n",
     "    # Rename the columns if needed\n",
-    "    result_df.columns = ['Keyword', 'Count']\n",
+    "    result_df.columns = [\"Keyword\", \"Count\"]\n",
     "    return result_df"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 148,
+   "execution_count": 57,
    "id": "9d34f03f-0685-496b-816f-4a435e75f56c",
    "metadata": {},
    "outputs": [],
    "source": [
-    "proj_desc = count_categories(transit_df, 'lower_case_project_description_keyword_search')"
+    "proj_desc = count_categories(\n",
+    "    transit_df, \"lower_case_project_description_keyword_search\"\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 149,
+   "execution_count": 58,
    "id": "6f9c0470-a9b1-47fb-aa1b-10d864b3da34",
    "metadata": {},
    "outputs": [],
    "source": [
-    "title = count_categories(transit_df, 'lower_case_project_title_keyword_search')"
+    "title = count_categories(transit_df, \"lower_case_project_title_keyword_search\")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 150,
+   "execution_count": 59,
    "id": "c31bfc4d-7acc-402b-a22f-0f9c5bb74b90",
    "metadata": {},
    "outputs": [],
    "source": [
-    "categories = pd.merge(proj_desc, title, on = 'Keyword', how = 'outer', indicator = True)"
+    "categories = pd.merge(proj_desc, title, on=\"Keyword\", how=\"outer\", indicator=True)"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 151,
+   "execution_count": 60,
    "id": "30c575f5-3cee-4f84-86ec-c491564c120b",
    "metadata": {},
    "outputs": [],
    "source": [
-    "categories['Total Projects'] = categories.Count_x.fillna(0) + categories.Count_y.fillna(0)"
+    "categories[\"Total Projects\"] = categories.Count_x.fillna(0) + categories.Count_y.fillna(\n",
+    "    0\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 152,
+   "execution_count": 61,
    "id": "5789e0cc-751c-4471-919e-6321cf3ff3fc",
    "metadata": {},
    "outputs": [],
    "source": [
-    "categories = categories.sort_values(by = ['Total Projects'], ascending = False).reset_index(drop = True).drop(columns = ['Count_x','Count_y','_merge'])"
+    "categories = (\n",
+    "    categories.sort_values(by=[\"Total Projects\"], ascending=False)\n",
+    "    .reset_index(drop=True)\n",
+    "    .drop(columns=[\"Count_x\", \"Count_y\", \"_merge\"])\n",
+    ")"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 153,
+   "execution_count": 62,
    "id": "f488ab7a-9a99-4890-ad9f-fc0e3209c5f7",
    "metadata": {},
    "outputs": [
@@ -1946,362 +2051,162 @@
        "  <tbody>\n",
        "    <tr>\n",
        "      <th>0</th>\n",
-       "      <td>bike</td>\n",
-       "      <td>2771.00</td>\n",
+       "      <td>bus</td>\n",
+       "      <td>1725.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>1</th>\n",
-       "      <td>bus</td>\n",
-       "      <td>1578.00</td>\n",
+       "      <td>transit</td>\n",
+       "      <td>959.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>2</th>\n",
-       "      <td>pedestrian</td>\n",
-       "      <td>1297.00</td>\n",
+       "      <td>van</td>\n",
+       "      <td>408.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>3</th>\n",
-       "      <td>transit</td>\n",
-       "      <td>838.00</td>\n",
+       "      <td>rail</td>\n",
+       "      <td>375.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>4</th>\n",
-       "      <td>sidewalk</td>\n",
-       "      <td>719.00</td>\n",
+       "      <td>buses</td>\n",
+       "      <td>259.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>5</th>\n",
-       "      <td>trail</td>\n",
-       "      <td>533.00</td>\n",
+       "      <td>station</td>\n",
+       "      <td>251.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>6</th>\n",
-       "      <td>bicycle</td>\n",
-       "      <td>466.00</td>\n",
+       "      <td>railroad</td>\n",
+       "      <td>209.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>7</th>\n",
-       "      <td>signals</td>\n",
-       "      <td>453.00</td>\n",
+       "      <td>paratransit</td>\n",
+       "      <td>86.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>8</th>\n",
-       "      <td>van</td>\n",
-       "      <td>407.00</td>\n",
+       "      <td>multimodal</td>\n",
+       "      <td>78.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>9</th>\n",
-       "      <td>rail</td>\n",
-       "      <td>326.00</td>\n",
+       "      <td>light rail</td>\n",
+       "      <td>73.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>10</th>\n",
-       "      <td>buses</td>\n",
-       "      <td>240.00</td>\n",
+       "      <td>ferry</td>\n",
+       "      <td>64.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>11</th>\n",
-       "      <td>station</td>\n",
-       "      <td>212.00</td>\n",
+       "      <td>intermodal</td>\n",
+       "      <td>41.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>12</th>\n",
-       "      <td>transfer</td>\n",
-       "      <td>195.00</td>\n",
+       "      <td>brt</td>\n",
+       "      <td>37.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>13</th>\n",
-       "      <td>locations</td>\n",
-       "      <td>194.00</td>\n",
+       "      <td>train</td>\n",
+       "      <td>34.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>14</th>\n",
-       "      <td>railroad</td>\n",
-       "      <td>175.00</td>\n",
+       "      <td>terminal</td>\n",
+       "      <td>31.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>15</th>\n",
-       "      <td>facility</td>\n",
-       "      <td>164.00</td>\n",
+       "      <td>commuter rail</td>\n",
+       "      <td>30.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
        "      <th>16</th>\n",
-       "      <td>mobility management</td>\n",
-       "      <td>134.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>17</th>\n",
-       "      <td>crosswalk</td>\n",
-       "      <td>103.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>18</th>\n",
-       "      <td>ped</td>\n",
-       "      <td>99.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>19</th>\n",
-       "      <td>vehicles</td>\n",
-       "      <td>89.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>20</th>\n",
-       "      <td>paratransit</td>\n",
-       "      <td>83.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>21</th>\n",
-       "      <td>multimodal</td>\n",
-       "      <td>73.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>22</th>\n",
-       "      <td>rideshare</td>\n",
-       "      <td>67.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>23</th>\n",
-       "      <td>srts</td>\n",
-       "      <td>63.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>24</th>\n",
-       "      <td>light rail</td>\n",
-       "      <td>62.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>25</th>\n",
-       "      <td>ferry</td>\n",
-       "      <td>61.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>26</th>\n",
-       "      <td>routes</td>\n",
-       "      <td>60.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>27</th>\n",
-       "      <td>bus stop</td>\n",
-       "      <td>55.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>28</th>\n",
-       "      <td>congestion</td>\n",
-       "      <td>53.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>29</th>\n",
-       "      <td>bicycle lane</td>\n",
-       "      <td>49.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>30</th>\n",
-       "      <td>safe routes to school</td>\n",
-       "      <td>43.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>31</th>\n",
-       "      <td>metro</td>\n",
-       "      <td>42.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>32</th>\n",
-       "      <td>intermodal</td>\n",
-       "      <td>41.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>33</th>\n",
-       "      <td>brt</td>\n",
-       "      <td>34.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>34</th>\n",
-       "      <td>atp</td>\n",
-       "      <td>33.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>35</th>\n",
-       "      <td>train</td>\n",
-       "      <td>31.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>36</th>\n",
-       "      <td>park and ride</td>\n",
-       "      <td>29.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>37</th>\n",
        "      <td>express bus</td>\n",
        "      <td>29.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>38</th>\n",
-       "      <td>commuter rail</td>\n",
-       "      <td>29.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>39</th>\n",
-       "      <td>vanpool</td>\n",
-       "      <td>22.00</td>\n",
+       "      <th>17</th>\n",
+       "      <td>trains</td>\n",
+       "      <td>20.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>40</th>\n",
+       "      <th>18</th>\n",
        "      <td>trolley</td>\n",
        "      <td>18.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>41</th>\n",
+       "      <th>19</th>\n",
        "      <td>shuttle</td>\n",
-       "      <td>16.00</td>\n",
+       "      <td>17.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>42</th>\n",
+       "      <th>20</th>\n",
        "      <td>locomotives</td>\n",
-       "      <td>15.00</td>\n",
+       "      <td>16.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>43</th>\n",
-       "      <td>rail network</td>\n",
+       "      <th>21</th>\n",
+       "      <td>vans</td>\n",
        "      <td>15.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>44</th>\n",
-       "      <td>transit agency</td>\n",
-       "      <td>14.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>45</th>\n",
+       "      <th>22</th>\n",
        "      <td>mobility hub</td>\n",
-       "      <td>13.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>46</th>\n",
-       "      <td>bus shelter</td>\n",
-       "      <td>13.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>47</th>\n",
-       "      <td>fare</td>\n",
-       "      <td>11.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>48</th>\n",
-       "      <td>bus rapid transit</td>\n",
-       "      <td>9.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>49</th>\n",
-       "      <td>transit signal priority</td>\n",
-       "      <td>8.00</td>\n",
+       "      <td>14.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>50</th>\n",
+       "      <th>23</th>\n",
        "      <td>fare collection</td>\n",
-       "      <td>7.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>51</th>\n",
-       "      <td>bus route</td>\n",
-       "      <td>6.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>52</th>\n",
-       "      <td>light rail vehicles</td>\n",
-       "      <td>5.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>53</th>\n",
-       "      <td>turnouts</td>\n",
-       "      <td>5.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>54</th>\n",
-       "      <td>commute</td>\n",
-       "      <td>5.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>55</th>\n",
-       "      <td>yarts</td>\n",
-       "      <td>4.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>56</th>\n",
-       "      <td>pedestrain</td>\n",
-       "      <td>4.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>57</th>\n",
-       "      <td>rider</td>\n",
-       "      <td>3.00</td>\n",
+       "      <td>8.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>58</th>\n",
+       "      <th>24</th>\n",
        "      <td>integrated transit</td>\n",
-       "      <td>3.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>59</th>\n",
-       "      <td>vessels</td>\n",
-       "      <td>3.00</td>\n",
+       "      <td>4.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>60</th>\n",
-       "      <td>public transportation</td>\n",
+       "      <th>25</th>\n",
+       "      <td>trolleys</td>\n",
        "      <td>2.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>61</th>\n",
+       "      <th>26</th>\n",
        "      <td>fare payment</td>\n",
        "      <td>2.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>62</th>\n",
-       "      <td>trolleys</td>\n",
-       "      <td>2.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>63</th>\n",
-       "      <td>bus network</td>\n",
-       "      <td>1.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>64</th>\n",
+       "      <th>27</th>\n",
        "      <td>farebox</td>\n",
        "      <td>1.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>65</th>\n",
-       "      <td>emus</td>\n",
-       "      <td>1.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>66</th>\n",
-       "      <td>cyclist</td>\n",
-       "      <td>1.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>67</th>\n",
-       "      <td>traffic reduction</td>\n",
-       "      <td>1.00</td>\n",
-       "    </tr>\n",
-       "    <tr>\n",
-       "      <th>68</th>\n",
-       "      <td>automated transit</td>\n",
+       "      <th>28</th>\n",
+       "      <td>fare technology</td>\n",
        "      <td>1.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>69</th>\n",
-       "      <td>ridesharing</td>\n",
+       "      <th>29</th>\n",
+       "      <td>shuttles</td>\n",
        "      <td>1.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>70</th>\n",
+       "      <th>30</th>\n",
        "      <td>mobility as a service</td>\n",
        "      <td>1.00</td>\n",
        "    </tr>\n",
        "    <tr>\n",
-       "      <th>71</th>\n",
-       "      <td>car share</td>\n",
+       "      <th>31</th>\n",
+       "      <td>automated transit</td>\n",
        "      <td>1.00</td>\n",
        "    </tr>\n",
        "  </tbody>\n",
@@ -2309,82 +2214,42 @@
        "</div>"
       ],
       "text/plain": [
-       "                    Keyword  Total Projects\n",
-       "0                      bike         2771.00\n",
-       "1                       bus         1578.00\n",
-       "2                pedestrian         1297.00\n",
-       "3                   transit          838.00\n",
-       "4                  sidewalk          719.00\n",
-       "5                     trail          533.00\n",
-       "6                   bicycle          466.00\n",
-       "7                   signals          453.00\n",
-       "8                       van          407.00\n",
-       "9                      rail          326.00\n",
-       "10                    buses          240.00\n",
-       "11                  station          212.00\n",
-       "12                 transfer          195.00\n",
-       "13                locations          194.00\n",
-       "14                 railroad          175.00\n",
-       "15                 facility          164.00\n",
-       "16      mobility management          134.00\n",
-       "17                crosswalk          103.00\n",
-       "18                      ped           99.00\n",
-       "19                 vehicles           89.00\n",
-       "20              paratransit           83.00\n",
-       "21               multimodal           73.00\n",
-       "22                rideshare           67.00\n",
-       "23                     srts           63.00\n",
-       "24               light rail           62.00\n",
-       "25                    ferry           61.00\n",
-       "26                   routes           60.00\n",
-       "27                 bus stop           55.00\n",
-       "28               congestion           53.00\n",
-       "29             bicycle lane           49.00\n",
-       "30    safe routes to school           43.00\n",
-       "31                    metro           42.00\n",
-       "32               intermodal           41.00\n",
-       "33                      brt           34.00\n",
-       "34                      atp           33.00\n",
-       "35                    train           31.00\n",
-       "36            park and ride           29.00\n",
-       "37              express bus           29.00\n",
-       "38            commuter rail           29.00\n",
-       "39                  vanpool           22.00\n",
-       "40                  trolley           18.00\n",
-       "41                  shuttle           16.00\n",
-       "42              locomotives           15.00\n",
-       "43             rail network           15.00\n",
-       "44           transit agency           14.00\n",
-       "45             mobility hub           13.00\n",
-       "46              bus shelter           13.00\n",
-       "47                     fare           11.00\n",
-       "48        bus rapid transit            9.00\n",
-       "49  transit signal priority            8.00\n",
-       "50          fare collection            7.00\n",
-       "51                bus route            6.00\n",
-       "52      light rail vehicles            5.00\n",
-       "53                 turnouts            5.00\n",
-       "54                  commute            5.00\n",
-       "55                    yarts            4.00\n",
-       "56               pedestrain            4.00\n",
-       "57                    rider            3.00\n",
-       "58       integrated transit            3.00\n",
-       "59                  vessels            3.00\n",
-       "60    public transportation            2.00\n",
-       "61             fare payment            2.00\n",
-       "62                 trolleys            2.00\n",
-       "63              bus network            1.00\n",
-       "64                  farebox            1.00\n",
-       "65                     emus            1.00\n",
-       "66                  cyclist            1.00\n",
-       "67        traffic reduction            1.00\n",
-       "68        automated transit            1.00\n",
-       "69              ridesharing            1.00\n",
-       "70    mobility as a service            1.00\n",
-       "71                car share            1.00"
+       "                  Keyword  Total Projects\n",
+       "0                     bus         1725.00\n",
+       "1                 transit          959.00\n",
+       "2                     van          408.00\n",
+       "3                    rail          375.00\n",
+       "4                   buses          259.00\n",
+       "5                 station          251.00\n",
+       "6                railroad          209.00\n",
+       "7             paratransit           86.00\n",
+       "8              multimodal           78.00\n",
+       "9              light rail           73.00\n",
+       "10                  ferry           64.00\n",
+       "11             intermodal           41.00\n",
+       "12                    brt           37.00\n",
+       "13                  train           34.00\n",
+       "14               terminal           31.00\n",
+       "15          commuter rail           30.00\n",
+       "16            express bus           29.00\n",
+       "17                 trains           20.00\n",
+       "18                trolley           18.00\n",
+       "19                shuttle           17.00\n",
+       "20            locomotives           16.00\n",
+       "21                   vans           15.00\n",
+       "22           mobility hub           14.00\n",
+       "23        fare collection            8.00\n",
+       "24     integrated transit            4.00\n",
+       "25               trolleys            2.00\n",
+       "26           fare payment            2.00\n",
+       "27                farebox            1.00\n",
+       "28        fare technology            1.00\n",
+       "29               shuttles            1.00\n",
+       "30  mobility as a service            1.00\n",
+       "31      automated transit            1.00"
       ]
      },
-     "execution_count": 153,
+     "execution_count": 62,
      "metadata": {},
      "output_type": "execute_result"
     }