diff --git a/project_list/_csis_utils.py b/project_list/_csis_utils.py new file mode 100644 index 000000000..5ecec155b --- /dev/null +++ b/project_list/_csis_utils.py @@ -0,0 +1,94 @@ +import pandas as pd +def csis_clean_project(df:pd.DataFrame)->pd.DataFrame: + df = df.fillna(df.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64': 0})) + string_cols = [ + 'needpurpose', + 'proj_desc', + 'route1', + 'title'] + # Clean strings + for i in string_cols: + df[i] = df[i].str.title().str.lstrip().str.rstrip() + df[i] = df[i].replace(r'\s+', ' ', regex=True) + + # Drop projects by ctips_id + df2 = df.drop_duplicates(subset = ['ctips_id']) + + # Filter out any rows where chg_qual1==7 because those are projects that are deleted + df2 = df2.loc[(df2.chg_qual1 != 7)] + df2 = df2.loc[(df2.archive == 0)] + df2 = df2.loc[(df2.document != "DSHOPP")] + df2 = df2.loc[(df2.chg_offcl != 14)] + df2 = df2.loc[(df2.chg_qual1 != 15)] + df2 = df2.loc[(df2.chg_qual1 != 16)] + df2 = df2.loc[(df2.chg_qual1 != 18)] + df2 = df2.loc[(df2.chg_qual1 != 20)] + df2 = df2.loc[(df2.chg_qual1 != 28)] + return df2 + +def add_agencies(left_df: pd.DataFrame, right_df: pd.DataFrame, col: str) -> pd.DataFrame: + merged_df = pd.merge( + left_df, + right_df, + left_on=col, + right_on='agencyid', + how='left' + ) + + renamed_df = merged_df.rename( + columns={ + 'agency_name_y': f'{col}_agency', + 'agencyid_x': 'agencyid', + 'agency_name_x': 'agency_name' + } + ) + + final_df = renamed_df.drop(columns=['agencyid_y']) + + return final_df + +def add_counties(left_df: pd.DataFrame, right_df: pd.DataFrame, col: str) -> pd.DataFrame: + merged_df = pd.merge( + left_df, + right_df, + left_on=col, + right_on='countyid', + how='left' + ) + + renamed_df = merged_df.rename( + columns={ + 'county_name_y': f'{col}_county', + 'countyid_x': 'countyid', + 'county_name_x': 'county_name' + } + ) + + final_df = renamed_df.drop(columns=['countyid_y', col]) + + return final_df + +def 
calculate_state_fed_local_total_funds(df:pd.DataFrame, fund_keywords:list, total_col_name:str)->pd.DataFrame: + selected_columns = [col for col in df.columns if any(keyword.lower() in col.lower() for keyword in fund_keywords)] + df[total_col_name] = df[selected_columns].fillna(0).sum(axis = 1) + return df + +def clean_political(df:pd.DataFrame, keyword_to_search:str)->pd.DataFrame: + my_list = [] + # Append a string to the list + my_list.append(keyword_to_search) + + filtered_columns = [col for col in df.columns if any(keyword.lower() in col.lower() for keyword in my_list)] + all_cols = filtered_columns + ['ctips_id'] + df2 = df[all_cols] + + # Make this from wide to long + df2 = pd.melt(df2, id_vars=['ctips_id'], value_vars=filtered_columns) + + # Clean up columns + df2.variable = df2.variable.str.replace(keyword_to_search, '') + df2 = df2.rename(columns = {'variable':keyword_to_search}) + + # Only keep relevant values for each project + df2 = df2.loc[df2.value == 1.0].reset_index(drop = True).drop(columns = ['value']) + return df2 \ No newline at end of file diff --git a/project_list/_database_utils.py b/project_list/_database_utils.py new file mode 100644 index 000000000..9aa964a97 --- /dev/null +++ b/project_list/_database_utils.py @@ -0,0 +1,22 @@ +def to_snakecase(df): + df.columns = df.columns.str.lower().str.replace(' ','_') + return df + +# Tag whether something is funded by state/federal/both +def is_state_funds(row): + if row.total_state_funds > 0: + return "Yes" + else: + return "No" + +def is_fed_funds(row): + if row.total_federal_funds > 0: + return "Yes" + else: + return "No" + +def is_local_funds(row): + if row.total_local_funds > 0: + return "Yes" + else: + return "No" \ No newline at end of file diff --git a/project_list/_harmonization_utils.py b/project_list/_harmonization_utils.py index 1931c02cd..ae4a32e20 100644 --- a/project_list/_harmonization_utils.py +++ b/project_list/_harmonization_utils.py @@ -12,7 +12,6 @@ def load_state_rail_plan(): 
df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file) return df - def load_lost(): df = to_snakecase(pd.read_excel(f"{GCS_FILE_PATH}LOST/LOST_all_projects.xlsx", sheet_name = "Main")) @@ -20,10 +19,6 @@ def load_lost(): df.estimated_lost_funds = df.estimated_lost_funds* 1_000_000 return df - -def load_sb1(): - return sb1_utils.sb1_final() - """ Harmonizing Functions diff --git a/project_list/add_LRTP_congestion.ipynb b/project_list/add_LRTP_congestion.ipynb index 3666615fe..d597bfdfd 100644 --- a/project_list/add_LRTP_congestion.ipynb +++ b/project_list/add_LRTP_congestion.ipynb @@ -241,32 +241,32 @@ " \n", " \n", " \n", - " 24\n", - " $ 2.35 million\n", - " 2350000.00\n", - " 25\n", + " 235\n", + " $ 0.077 million\n", + " 4812.50\n", + " 236\n", " \n", " \n", - " 45\n", - " $ 16.3 million\n", - " 16300000.00\n", - " 45\n", + " 67\n", + " $1.5 million\n", + " 750000.00\n", + " 69\n", " \n", " \n", - " 51\n", - " $ 7.5 million\n", - " 7500000.00\n", - " 49\n", + " 216\n", + " $0.006 million\n", + " 5760.00\n", + " 217\n", " \n", " \n", "\n", "" ], "text/plain": [ - " fund_estimate total_project_cost rtp_id\n", - "24 $ 2.35 million 2350000.00 25\n", - "45 $ 16.3 million 16300000.00 45\n", - "51 $ 7.5 million 7500000.00 49" + " fund_estimate total_project_cost rtp_id\n", + "235 $ 0.077 million 4812.50 236\n", + "67 $1.5 million 750000.00 69\n", + "216 $0.006 million 5760.00 217" ] }, "execution_count": 9, @@ -690,14 +690,14 @@ " \n", " \n", " \n", - " 80\n", - " Capacity Increasing\n", + " 18\n", + " Proposed Improvements\n", + " Lemoore\n", " NaN\n", - " 41\n", - " 3.8/6.4\n", - " Avenal Creek to s/o SR 33\n", - " Construct Passing Lanes\n", " NaN\n", + " Spring Lane\n", + " 100 ft. 
east of Beverly Dr.\n", + " Overlay\n", " None\n", " 0\n", " \n", @@ -706,14 +706,11 @@ "" ], "text/plain": [ - " category jurisdiction state_route post_mile \\\n", - "80 Capacity Increasing NaN 41 3.8/6.4 \n", - "\n", - " location project_limits description title \\\n", - "80 Avenal Creek to s/o SR 33 Construct Passing Lanes NaN None \n", + " category jurisdiction state_route post_mile location \\\n", + "18 Proposed Improvements Lemoore NaN NaN Spring Lane \n", "\n", - " total_cost \n", - "80 0 " + " project_limits description title total_cost \n", + "18 100 ft. east of Beverly Dr. Overlay None 0 " ] }, "execution_count": 19, @@ -947,14 +944,14 @@ " \n", " \n", " \n", - " 11\n", + " 42\n", " Proposed Improvements\n", - " Hanford\n", + " Lemoore\n", " NaN\n", " NaN\n", - " 13th Avenue\n", - " Houston Ave. to Lacey Blvd.\n", - " Widen from 2 to 4 lanes with median\n", + " Magnolia Street\n", + " Lemoore Ave. to Smith Ave.\n", + " Overlay\n", " None\n", " 0\n", " \n", @@ -963,14 +960,11 @@ "" ], "text/plain": [ - " category jurisdiction state_route post_mile location \\\n", - "11 Proposed Improvements Hanford NaN NaN 13th Avenue \n", - "\n", - " project_limits description title \\\n", - "11 Houston Ave. to Lacey Blvd. Widen from 2 to 4 lanes with median None \n", + " category jurisdiction state_route post_mile location \\\n", + "42 Proposed Improvements Lemoore NaN NaN Magnolia Street \n", "\n", - " total_cost \n", - "11 0 " + " project_limits description title total_cost \n", + "42 Lemoore Ave. to Smith Ave. Overlay None 0 " ] }, "execution_count": 24, @@ -1037,39 +1031,39 @@ " \n", " \n", " \n", - " 761\n", - " Kern County ‐ Class II Bike Ln\n", - " Houghton Rd (Old River Rd to Union Av) 6. mi. 
$ 543\n", - " 543000\n", - " 543000\n", + " 360\n", + " Metro Passenger Rail\n", + " Amtrak Station ‐ Phase II $ 13,000\n", + " 13000000\n", + " 13000000\n", " NaN\n", - " 543000\n", + " 13000000\n", " \n", " \n", - " 327\n", - " Route 99\n", - " Rt 99 @ Minkler Spur ‐ construct grade separation\n", - " 52152000\n", - " 69000000\n", - " 16848000\n", - " 52152000\n", + " 1126\n", + " Mendiburu Path / California City Blvd‐88\n", + " Class I Shared Use Path ‐ 1.6 mile ‐ Add new off‐St class I shared use path\n", + " 1445000\n", + " 1445000\n", + " NaN\n", + " 1445000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " project_title \\\n", - "761 Kern County ‐ Class II Bike Ln \n", - "327 Route 99 \n", + " project_title \\\n", + "360 Metro Passenger Rail \n", + "1126 Mendiburu Path / California City Blvd‐88 \n", "\n", - " scope \\\n", - "761 Houghton Rd (Old River Rd to Union Av) 6. mi. $ 543 \n", - "327 Rt 99 @ Minkler Spur ‐ construct grade separation \n", + " scope \\\n", + "360 Amtrak Station ‐ Phase II $ 13,000 \n", + "1126 Class I Shared Use Path ‐ 1.6 mile ‐ Add new off‐St class I shared use path \n", "\n", - " yoe_w__new_revenue yoe_w_o_new_reven maint__inflation_savings cost \n", - "761 543000 543000 NaN 543000 \n", - "327 52152000 69000000 16848000 52152000 " + " yoe_w__new_revenue yoe_w_o_new_reven maint__inflation_savings cost \n", + "360 13000000 13000000 NaN 13000000 \n", + "1126 1445000 1445000 NaN 1445000 " ] }, "execution_count": 26, @@ -1886,32 +1880,32 @@ " \n", " \n", " \n", - " 15\n", - " Gustine\n", - " Borelli Ranch Park Multi-use Path\n", - " Construct a Multi-use Path from Fentem Rd to the end of Via Palermo\n", - " Active (Bike/Ped)\n", - " 2030\n", - " 450\n", - " CMAQ, Local, Measure V\n", - " 450000\n", + " 54\n", + " Merced\n", + " SR-59 Merced Widening Phase 4\n", + " Widen 2 to 4 lanes from Cardella Rd to Bellevue Rd\n", + " Road Capacity\n", + " 2045\n", + " 30000\n", + " SB-1, Measure V, Local, SHOPP\n", + " 30000000\n", " \n", " \n", 
"\n", "" ], "text/plain": [ - " agency title \\\n", - "15 Gustine Borelli Ranch Park Multi-use Path \n", + " agency title \\\n", + "54 Merced SR-59 Merced Widening Phase 4 \n", "\n", - " limits_description \\\n", - "15 Construct a Multi-use Path from Fentem Rd to the end of Via Palermo \n", + " limits_description type \\\n", + "54 Widen 2 to 4 lanes from Cardella Rd to Bellevue Rd Road Capacity \n", "\n", - " type completion\\nyear total_cost\\n_$1,000s_ \\\n", - "15 Active (Bike/Ped) 2030 450 \n", + " completion\\nyear total_cost\\n_$1,000s_ funding_sources \\\n", + "54 2045 30000 SB-1, Measure V, Local, SHOPP \n", "\n", - " funding_sources total_cost_millions \n", - "15 CMAQ, Local, Measure V 450000 " + " total_cost_millions \n", + "54 30000000 " ] }, "execution_count": 39, @@ -2135,37 +2129,37 @@ " \n", " \n", " \n", - " 1452\n", - " YOL17360\n", - " Project Development Only\n", - " YOL\n", - " City of Woodland\n", + " 723\n", + " SAC24111\n", + " Programmed\n", + " SAC\n", + " City of Elk Grove\n", " B- Road & Highway Capacity\n", - " Parkland Ave.\n", - " Construct New Road: 2 lane arterial from Pioneer Ave. to East St.\n", - " 9044751\n", + " Lotz Parkway\n", + " In Elk Grove, Lotz Parkway from Whitelock Parkway to Poppy Ridge Road: Construct new 4-lane roadway; and Lotz Parkway from Poppy Ridge Road to\\n0.5 miles south of Whitelock Pkwy at the northern boundary of the Sterling\\nMeadows development area: Construct new 2-lane roadway.\n", + " 8662500\n", " NaN\n", - " Post-2040\n", + " 2020-2025\n", " \n", " \n", "\n", "" ], "text/plain": [ - " id status__planned,_programmed_or_project_development_only_ \\\n", - "1452 YOL17360 Project Development Only \n", + " id status__planned,_programmed_or_project_development_only_ county \\\n", + "723 SAC24111 Programmed SAC \n", "\n", - " county lead_agency budget_category title \\\n", - "1452 YOL City of Woodland B- Road & Highway Capacity Parkland Ave. 
\n", + " lead_agency budget_category title \\\n", + "723 City of Elk Grove B- Road & Highway Capacity Lotz Parkway \n", "\n", - " description \\\n", - "1452 Construct New Road: 2 lane arterial from Pioneer Ave. to East St. \n", + " description \\\n", + "723 In Elk Grove, Lotz Parkway from Whitelock Parkway to Poppy Ridge Road: Construct new 4-lane roadway; and Lotz Parkway from Poppy Ridge Road to\\n0.5 miles south of Whitelock Pkwy at the northern boundary of the Sterling\\nMeadows development area: Construct new 2-lane roadway. \n", "\n", - " total_project_cost__2018_dollars_ \\\n", - "1452 9044751 \n", + " total_project_cost__2018_dollars_ \\\n", + "723 8662500 \n", "\n", - " year_of_expenditure_cost_for_planned_projects completion_timing \n", - "1452 NaN Post-2040 " + " year_of_expenditure_cost_for_planned_projects completion_timing \n", + "723 NaN 2020-2025 " ] }, "execution_count": 51, @@ -2482,29 +2476,35 @@ " \n", " \n", " \n", - " 197\n", - " Planned\n", - " C-PL-6: Franklin Creek Multiuse Path\n", - " Construction\n", - " Construct a multiuse path along Franklin Creek from Carpinteria Ave to 7th St.\n", + " 367\n", + " VMT Reducing\n", + " SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings\n", + " PA&ED, PS&E,\\nConstruction\n", + " Implement a road diet on De La Vina Street from Constance Avenue to Padre Street. Crossing enhancements included.\n", " NaN\n", - " 2023\n", - " 750\n", - " 750000\n", + " 2050\n", + " 1988\n", + " 1988000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " type project_title phase \\\n", - "197 Planned C-PL-6: Franklin Creek Multiuse Path Construction \n", + " type \\\n", + "367 VMT Reducing \n", + "\n", + " project_title \\\n", + "367 SB-PL-15: Upper De la Vina St Gap Closure and Safe Crossings \n", "\n", - " description \\\n", - "197 Construct a multiuse path along Franklin Creek from Carpinteria Ave to 7th St. 
\n", + " phase \\\n", + "367 PA&ED, PS&E,\\nConstruction \n", + "\n", + " description \\\n", + "367 Implement a road diet on De La Vina Street from Constance Avenue to Padre Street. Crossing enhancements included. \n", "\n", " primary_funding_source_s_ year total_cost__$000s_ total_cost_millions \n", - "197 NaN 2023 750 750000 " + "367 NaN 2050 1988 1988000 " ] }, "execution_count": 57, @@ -3010,7 +3010,7 @@ }, { "cell_type": "code", - "execution_count": 74, + "execution_count": 69, "id": "e3f73aaa-91e9-4290-8335-ac961215b1c9", "metadata": {}, "outputs": [ @@ -3052,75 +3052,51 @@ " \n", " \n", " \n", - " 593\n", - " PASSENGER RAIL\n", - " CALIFORNIA HIGH SPEED RAIL AUTHORITY\n", - " 1TR1012\n", + " 812\n", + " TRANSIT\n", + " LOS ANGELES COUNTY MTA (METRO)\n", + " 1TL0703\n", " 0\n", - " CALIFORNIA HIGH-\\nSPEED RAIL\n", - " NaN\n", + " METRO RAIL TRANSIT CAPITAL\n", + " COUNTYWIDE\n", " NaN\n", - " CALIFORNIA HIGH-SPEED RAIL PHASE 1 - ENV/PE\n", - " 2021\n", - " 332000\n", + " RAIL CAPITAL PROJECTS\n", + " 2040\n", + " 19151000\n", " NaN\n", " No Title\n", - " 332000000\n", - " \n", - " \n", - " 2941\n", - " PASSENGER RAIL\n", - " CHSRA\n", - " 7120010\n", - " 0\n", - " CALIFORNIA HIGH-\\nSPEED RAIL\n", - " REGIONWIDE\n", - " NaN\n", - " CALIFORNIA HIGH-SPEED RAIL - PHASE 1 (INCLUDES METROLINK AND\\nLOSSAN CORRIDOR SPEED UPGRADES)\n", - " 2033\n", - " 38960000\n", - " NaN\n", - " No Title\n", - " 38960000000\n", + " 19151000000\n", " \n", " \n", "\n", "" ], "text/plain": [ - " system lead_agency rtp_id route_# \\\n", - "593 PASSENGER RAIL CALIFORNIA HIGH SPEED RAIL AUTHORITY 1TR1012 0 \n", - "2941 PASSENGER RAIL CHSRA 7120010 0 \n", - "\n", - " route_name from to \\\n", - "593 CALIFORNIA HIGH-\\nSPEED RAIL NaN NaN \n", - "2941 CALIFORNIA HIGH-\\nSPEED RAIL REGIONWIDE NaN \n", + " system lead_agency rtp_id route_# \\\n", + "812 TRANSIT LOS ANGELES COUNTY MTA (METRO) 1TL0703 0 \n", "\n", - " description \\\n", - "593 CALIFORNIA HIGH-SPEED RAIL PHASE 1 - ENV/PE \n", 
- "2941 CALIFORNIA HIGH-SPEED RAIL - PHASE 1 (INCLUDES METROLINK AND\\nLOSSAN CORRIDOR SPEED UPGRADES) \n", + " route_name from to description \\\n", + "812 METRO RAIL TRANSIT CAPITAL COUNTYWIDE NaN RAIL CAPITAL PROJECTS \n", "\n", - " completion_year project_cost__$1,000s_ county project_title \\\n", - "593 2021 332000 NaN No Title \n", - "2941 2033 38960000 NaN No Title \n", + " completion_year project_cost__$1,000s_ county project_title \\\n", + "812 2040 19151000 NaN No Title \n", "\n", - " project_cost_millions \n", - "593 332000000 \n", - "2941 38960000000 " + " project_cost_millions \n", + "812 19151000000 " ] }, - "execution_count": 74, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "scag.loc[scag.description.str.contains(\"California High-Speed Rail\", case=False)]\n" + "scag.loc[scag.description.str.contains(\"Rail Capital Projects\", case=False)]\n" ] }, { "cell_type": "code", - "execution_count": 75, + "execution_count": 70, "id": "78be03f0-de29-48f2-a1d6-3506c57081d5", "metadata": {}, "outputs": [ @@ -3130,7 +3106,7 @@ "38960000000" ] }, - "execution_count": 75, + "execution_count": 70, "metadata": {}, "output_type": "execute_result" } diff --git a/project_list/add_ctips_02_21_2024.ipynb b/project_list/add_ctips_02_21_2024.ipynb new file mode 100644 index 000000000..97cc968de --- /dev/null +++ b/project_list/add_ctips_02_21_2024.ipynb @@ -0,0 +1,2402 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "1e57c806", + "metadata": {}, + "source": [ + "## CTIPS\n", + "* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do\n", + "\n", + "### To do\n", + "* Get total cost\n", + " * total_cost: The total cost of this project.\n", + " Total project cost can be calculated using 3 tables: project, fundtype, and fundline\n", + " AH: which columns do I use from fundtype, project, and fundline to calculate the total cost?\n", + " You can calculate total programmed for a project using: fundline.action = P and 
project.high_offlc = 1\n", + " Then sum ( fundline.pe_paed + fundline.pe_env + fundline.pe_rw + fundline.pe_con + fundline.rw + fundline.con )\n", + "\n", + "* Ask if DSHOPP means draft SHOPP project\n", + "* PROJSCHE - not a lot of matches" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "331fca6b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd \n", + "import sqlalchemy \n", + "import sys \n", + "import re\n", + "import oracledb \n", + "import _database_utils as _utils \n", + "import _csis_utils" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cdf61f63", + "metadata": {}, + "outputs": [], + "source": [ + "oracledb.version = \"8.3.0\" \n", + "sys.modules[\"cx_Oracle\"] = oracledb " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b510d7b1", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 400\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + "pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "06b673b3", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "ENGINE_PATH_WIN_AUTH = f\"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}\" " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7525e19c", + "metadata": {}, + "outputs": [], + "source": [ + "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH) " + ] + }, + { + "cell_type": "markdown", + "id": "9a6c14bd", + "metadata": {}, + "source": [ + "## Project Base Table\n", + "### Project\n", + "Project.agencyid = project sponsor\n", + "\n", + "Implpaed = Implementing Agency for PA&ED\n", + "\n", + "Implpse = Implementing Agency for PS&E\n", + "\n", + "implcon = Implementing Agency for Construction\n", + "\n", + "implrw = Implementing Agency for Right of Way\n" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "id": "87510d9c", + "metadata": {}, + "outputs": [], + "source": [ + "projects_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "ctips_id,\n", + "appdate, \n", + "archive,\n", + "agencyid,\n", + "const_date,\n", + "countyid,\n", + "countyid2,\n", + "countyid3,\n", + "chg_offcl,\n", + "chg_qual1,\n", + "chg_qual2,\n", + "districtid,\n", + "document,\n", + "docyear,\n", + "ea_number,\n", + "high_ver,\n", + "high_offcl,\n", + "implpaed, \n", + "implpse, \n", + "implrw, \n", + "implcon, \n", + "needpurpose,\n", + "progcode1,\n", + "ppno,\n", + "proj_desc,\n", + "postmiles1,\n", + "pm1b,\n", + "pm2b,\n", + "pm3b,\n", + "pm1a,\n", + "pm2a,\n", + "pm3a,\n", + "projcomp_date,\n", + "projectid,\n", + "route1,\n", + "route2,\n", + "route3,\n", + "rtl,\n", + "title,\n", + "version\n", + "FROM ctips.project\n", + "ORDER BY high_ver DESC, version DESC, high_offcl DESC\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6e6fb595", + "metadata": {}, + "outputs": [], + "source": [ + "projects_df.document.unique()" + ] + }, + { + "cell_type": "markdown", + "id": "1fc58d5e", + "metadata": {}, + "source": [ + "#### DOUBLE check filtering on document" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "de82322c", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = _csis_utils.csis_clean_project(projects_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "05f51a07", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.document.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "32ae2bab", + "metadata": {}, + "outputs": [], + "source": [ + "len(projects_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1a634f44", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.ctips_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "4adbcc27", + "metadata": {}, + 
"source": [ + "### PROJSCHE" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "096e0ffd", + "metadata": {}, + "outputs": [], + "source": [ + "projsche_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "projectid,\n", + "m020 AS pa_ed_begin,\n", + "m200a AS pa_ed_end,\n", + "m200b AS ps_e_begin,\n", + "m224 AS begin_row,\n", + "m410 AS end_row,\n", + "m500 AS con_start_date,\n", + "m600 AS con_end_date,\n", + "m700 AS begin_closeout,\n", + "m800 AS end_closeout\n", + "FROM ctips.projsche\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0c32c036", + "metadata": {}, + "outputs": [], + "source": [ + "projsche_drop_cols = list(projsche_df.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8a85af0f", + "metadata": {}, + "outputs": [], + "source": [ + "projsche_drop_cols.remove('projectid')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acb324e6", + "metadata": {}, + "outputs": [], + "source": [ + "# I want to drop the rows in which ALL values in the date columns are empty\n", + "projsche_df2 = projsche_df.dropna(how = \"all\", subset = projsche_drop_cols).reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc6578cc", + "metadata": {}, + "outputs": [], + "source": [ + "len(projsche_df2), len(projsche_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fef2ed85", + "metadata": {}, + "outputs": [], + "source": [ + "projsche_df2.projectid.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c613a019", + "metadata": {}, + "outputs": [], + "source": [ + "projsche_df2.projectid.value_counts().head()" + ] + }, + { + "cell_type": "markdown", + "id": "320571a7", + "metadata": {}, + "source": [ + "#### Not a lot of matching values" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "311506d2", + "metadata": {}, + "outputs": [], + 
"source": [ + "pd.merge(projsche_df2, projects_table, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c6def815", + "metadata": {}, + "outputs": [], + "source": [ + "phase_dates_df = pd.merge(projects_table[['ctips_id', 'projectid']], projsche_df2, on ='projectid', how = 'inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68d8a80c", + "metadata": {}, + "outputs": [], + "source": [ + "phase_dates_df.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe1013ec", + "metadata": {}, + "outputs": [], + "source": [ + "phase_dates_df.head()" + ] + }, + { + "cell_type": "markdown", + "id": "6aee9af0", + "metadata": {}, + "source": [ + "### AGENCY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fda0b02a", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "agency_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "name AS agency_name,\n", + "agencyid\n", + "FROM ctips.agncy\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68229e79", + "metadata": {}, + "outputs": [], + "source": [ + "agency_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d3e86563", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = pd.merge(projects_table, agency_df, on ='agencyid', how = 'left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d6177b3d", + "metadata": {}, + "outputs": [], + "source": [ + "phase_agency_cols = ['implpaed','implpse','implrw', 'implcon']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "197380b2", + "metadata": {}, + "outputs": [], + "source": [ + "for i in phase_agency_cols:\n", + " projects_table = _csis_utils.add_agencies(projects_table, agency_df, i)" + ] + }, + { + "cell_type": "code", + "execution_count": 
null, + "id": "34b5bffe", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f0675e4", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = projects_table.drop(columns = phase_agency_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "f39f398c", + "metadata": {}, + "source": [ + "### COUNTY" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "72df0bb3", + "metadata": {}, + "outputs": [], + "source": [ + "county_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "name AS county_name,\n", + "countyid\n", + "FROM ctips.county\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a3be2d02", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df = pd.merge(projects_table[['ctips_id','countyid', 'countyid2', 'countyid3']], county_df, on ='countyid', how = 'left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "43179a66", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "421e9af2", + "metadata": {}, + "outputs": [], + "source": [ + "countyid_cols = ['countyid2', 'countyid3']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "74dfdb02", + "metadata": {}, + "outputs": [], + "source": [ + "for i in countyid_cols:\n", + " county_projects_df = _csis_utils.add_counties(county_projects_df, county_df, i)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ffaf0f1e", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df = county_projects_df[['ctips_id', 'county_name', 'countyid2_county', 'countyid3_county']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e617a674", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df.sample()" + ] + }, + { + "cell_type": 
"code", + "execution_count": null, + "id": "a36adee5", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Melt this from wide to long\n", + "county_projects_df2 = pd.melt(county_projects_df, id_vars=['ctips_id'], value_vars=['county_name','countyid2_county','countyid3_county'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1597b8d", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4311b58e", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2 = county_projects_df2.sort_values(by = ['ctips_id']).dropna().reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adc68256", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2 = county_projects_df2.drop(columns = ['variable']).rename(columns = {'value':'county'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb00e398", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2.ctips_id.value_counts().describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa7b2579", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2.loc[county_projects_df2.ctips_id == 10600002937]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2f1d8ea6", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = projects_table.drop(columns = countyid_cols)" + ] + }, + { + "cell_type": "markdown", + "id": "87d196cc", + "metadata": {}, + "source": [ + "### FUNDLINE\n", + "* For action: Action: P = programmed, V= vote, A=award" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7fc7047a", + "metadata": {}, + "outputs": [], + "source": [ + "fundline_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + " action,\n", + " con,\n", + " rw,\n", + " pe_paed,\n", + " pe_env,\n", + " 
pe_rw,\n", + " pe_con,\n", + " pe_total,\n", + " fundlineid,\n", + " fundtypeid,\n", + " line_year,\n", + " actiondate\n", + "FROM ctips.fundline\n", + "WHERE action = 'P'\n", + "\"\"\", engine)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cc6ac7b6", + "metadata": {}, + "outputs": [], + "source": [ + "fundline_df.action.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "d38f6532", + "metadata": {}, + "source": [ + "### Fundtype\n", + "* Fundtype.agencyid = funding agency" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "03ad06ed", + "metadata": {}, + "outputs": [], + "source": [ + "fundtype_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "fundtypeid,\n", + "fundid,\n", + "progcode,\n", + "programid,\n", + "projectid,\n", + "agencyid\n", + "FROM ctips.fundtype\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "markdown", + "id": "471153ad", + "metadata": {}, + "source": [ + "#### Merge everything" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2242bbf2", + "metadata": {}, + "outputs": [], + "source": [ + "pd.merge(fundtype_df,\n", + " fundline_df, \n", + " on = ['fundtypeid'], \n", + " how = \"outer\",\n", + " indicator = True,)[['_merge']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "56d38a60", + "metadata": {}, + "outputs": [], + "source": [ + "pd.merge(fundtype_df,fundline_df, on = ['fundtypeid'], how = \"outer\", indicator = True)[['_merge']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc6f24e1", + "metadata": {}, + "outputs": [], + "source": [ + "fund_m1 = pd.merge(fundtype_df,fundline_df, on = ['fundtypeid'], how = \"left\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8939f75b", + "metadata": {}, + "outputs": [], + "source": [ + "final_fund_m = pd.merge(projects_table[['projectid','ctips_id', 'document']], fund_m1, on = ['projectid'], how = 
\"inner\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "51fa4c0f", + "metadata": {}, + "outputs": [], + "source": [ + "final_fund_m.ctips_id.nunique(), len(final_fund_m)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a2301d5a", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "42f5bd20", + "metadata": {}, + "outputs": [], + "source": [ + "29152-29116" + ] + }, + { + "cell_type": "markdown", + "id": "0722f47b", + "metadata": {}, + "source": [ + "### Progmain" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f612c3ba", + "metadata": {}, + "outputs": [], + "source": [ + "progmain_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "programid,\n", + "category AS program\n", + "FROM ctips.progmain\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "markdown", + "id": "dae62ff2", + "metadata": {}, + "source": [ + "### Fund" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "abeb1cf9", + "metadata": {}, + "outputs": [], + "source": [ + "fund_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "fund,\n", + "fundid,\n", + "type AS fund_type_1_fed_2_state_3_local\n", + "FROM ctips.fund\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "markdown", + "id": "ae1f680e", + "metadata": {}, + "source": [ + "### Progsub\n", + "* Some progcodes have more than one progdesc\n", + "* Dropped duplicates bc the progdesc are similar\n", + "double_ids = ['20.30.010.820',\n", + " '20.XX.723.000',\n", + " '20.30.010.810',\n", + " '20.XX.720.100',\n", + " '20.30.010.817',\n", + " '20.30.210.200'\n", + " ]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2f0bcec", + "metadata": {}, + "outputs": [], + "source": [ + "progsub_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "progcode,\n", + "progdesc\n", + "FROM ctips.progsub\n", + "\"\"\", engine) " + ] + }, + 
{ + "cell_type": "code", + "execution_count": null, + "id": "88794e92", + "metadata": {}, + "outputs": [], + "source": [ + "progsub_df2 = progsub_df.drop_duplicates(subset = ['progcode'])" + ] + }, + { + "cell_type": "markdown", + "id": "236a5e98", + "metadata": {}, + "source": [ + "### Merge for work below" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e438665d", + "metadata": {}, + "outputs": [], + "source": [ + "final_fund_m.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d7341c6", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info = ((final_fund_m\n", + " .merge(progmain_df, on = ['programid'], how = \"left\")\n", + " .merge(fund_df, on =['fundid'], how = \"left\")\n", + " .merge(progsub_df2, on = ['progcode'], how = \"left\"))\n", + " .drop(columns = ['fundid','programid', 'progcode']))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8efa77b", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.fund.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fe4ec8c0", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info['fund'] = funding_w_program_info.fund + '-' + funding_w_program_info.fund_type_1_fed_2_state_3_local.astype('str')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "69499228", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info = funding_w_program_info.fillna(funding_w_program_info.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "29d0d6f8", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0311f5c0", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, 
+ "id": "6c6ae432", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.action.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "b203d580", + "metadata": {}, + "source": [ + "#### Filter out programmed temporarily for now." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea7a292d", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info = funding_w_program_info.loc[funding_w_program_info.action == \"P\"].reset_index(drop = True)" + ] + }, + { + "cell_type": "markdown", + "id": "0920e3c2", + "metadata": {}, + "source": [ + "## Phase Funding Table" + ] + }, + { + "cell_type": "markdown", + "id": "03e10f95", + "metadata": {}, + "source": [ + "#### First: find the # of funds a project has programmed/voted/awarded for each fund" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b4bde774", + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_agg = {**dict.fromkeys(['con', 'rw',\n", + " 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total'], 'sum')}\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "531ee9f5", + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_agg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "11b5f191", + "metadata": {}, + "outputs": [], + "source": [ + "# I want to find the total funds a project will receive for each fund\n", + "total_cost = funding_w_program_info.groupby(['ctips_id','fund','document']).agg(columns_to_agg).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "17396ff5", + "metadata": {}, + "outputs": [], + "source": [ + "total_cost.sample()" + ] + }, + { + "cell_type": "markdown", + "id": "f6059d2f", + "metadata": {}, + "source": [ + "##### Separate out FTIP and everything else to calculate total funds a project is estimated to receive" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c2c6638f", + "metadata": {}, 
+ "outputs": [], + "source": [ + "# Calculate out FTIP and other documents in 2 stages\n", + "ftip_only = total_cost.loc[total_cost.document.isin(['FTIP','DFTIP' ])].reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "48c83f88", + "metadata": {}, + "outputs": [], + "source": [ + "total_cost_ftip = ftip_only.groupby(['ctips_id', 'fund']).agg(columns_to_agg).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8bd716d", + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_keep = ['ctips_id', 'fund', 'total_cost']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e613cf0", + "metadata": {}, + "outputs": [], + "source": [ + "total_cost_ftip['total_cost'] = total_cost_ftip.con + total_cost_ftip.rw + total_cost_ftip.pe_total" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f74a386e", + "metadata": {}, + "outputs": [], + "source": [ + "total_cost_ftip = total_cost_ftip[cols_to_keep]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1118b944", + "metadata": {}, + "outputs": [], + "source": [ + "everything_else = total_cost.loc[~total_cost.document.isin(['FTIP','DFTIP'])].reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3070d228", + "metadata": {}, + "outputs": [], + "source": [ + "everything_else = everything_else.groupby(['ctips_id', 'fund']).agg(columns_to_agg).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d5ad472a", + "metadata": {}, + "outputs": [], + "source": [ + "everything_else['total_cost'] = everything_else.con + everything_else.rw + everything_else.pe_paed + everything_else.pe_env + everything_else.pe_rw + everything_else.pe_con" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e24f20fb", + "metadata": {}, + "outputs": [], + "source": [ + "everything_else = everything_else[cols_to_keep]" + ] + 
}, + { + "cell_type": "code", + "execution_count": null, + "id": "f442ffe1", + "metadata": {}, + "outputs": [], + "source": [ + "total_requested_funds_final = pd.concat([everything_else, total_cost_ftip])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9c2d01f9", + "metadata": {}, + "outputs": [], + "source": [ + "len(total_requested_funds_final), total_requested_funds_final.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f48aed8", + "metadata": {}, + "outputs": [], + "source": [ + "total_requested_funds_final.ctips_id.value_counts().describe()" + ] + }, + { + "cell_type": "markdown", + "id": "cd9c7213", + "metadata": {}, + "source": [ + "##### One project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "36c4d445", + "metadata": {}, + "outputs": [], + "source": [ + "total_requested_funds_final.loc[total_requested_funds_final.ctips_id == 20300000209]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f4c616bd", + "metadata": {}, + "outputs": [], + "source": [ + "8900.00+63400.00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "aa2dbaff", + "metadata": {}, + "outputs": [], + "source": [ + "9700.00+1100.00" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5e7b5497", + "metadata": {}, + "outputs": [], + "source": [ + "# Original \n", + "funding_w_program_info.loc[funding_w_program_info.ctips_id == 20300000209]" + ] + }, + { + "cell_type": "markdown", + "id": "6921ff42", + "metadata": {}, + "source": [ + "#### Pivot - I want the dataframe to be wide instead of long" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e73a815", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table = total_requested_funds_final.pivot_table(index=['ctips_id'], columns='fund', \n", + " values=['total_cost'], aggfunc='sum')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"369e76a8", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table.columns = fund_table.columns.droplevel()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8da306ee", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table = fund_table.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "089794b7", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0bba2cfa", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table = _utils.to_snakecase(fund_table)" + ] + }, + { + "cell_type": "markdown", + "id": "dd91c841", + "metadata": {}, + "source": [ + "#### Second: find the amount of $ for each phase" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "08582661", + "metadata": {}, + "outputs": [], + "source": [ + "cost_per_phase = funding_w_program_info.groupby(['ctips_id']).agg(columns_to_agg).reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b362f3c0", + "metadata": {}, + "outputs": [], + "source": [ + "cost_per_phase.shape, cost_per_phase.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "439f1946", + "metadata": {}, + "outputs": [], + "source": [ + "63400.00+9700.00+11300" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9e885972", + "metadata": {}, + "outputs": [], + "source": [ + "fund_table.loc[fund_table.ctips_id == 20300000209].dropna(axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b1c458ea", + "metadata": {}, + "outputs": [], + "source": [ + "cost_per_phase.loc[cost_per_phase.ctips_id == 20300000209].dropna(axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "08564b97", + "metadata": {}, + "source": [ + "#### Third: merge these 2 tables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "41e08290", + "metadata": {}, + 
"outputs": [], + "source": [ + "final_phase_funding_table = pd.merge(fund_table, cost_per_phase, on = [\"ctips_id\"], how = \"inner\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "79f2f2f2", + "metadata": {}, + "outputs": [], + "source": [ + "len(final_phase_funding_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2c33f3ae", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "97ad053f", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)" + ] + }, + { + "cell_type": "markdown", + "id": "a824983c", + "metadata": {}, + "source": [ + "#### Fourth: find state v federal \n", + "* State funds is a lot more." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b3471a77", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "federal_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['1.0'], 'total_federal_funds')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4b164f51", + "metadata": {}, + "outputs": [], + "source": [ + "state_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['2.0'], 'total_state_funds')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "356faf47", + "metadata": {}, + "outputs": [], + "source": [ + "local_funds = _csis_utils.calculate_state_fed_local_total_funds(final_phase_funding_table, ['3.0'], 'total_local_funds')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1c0975b2", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table[\"is_state\"] = final_phase_funding_table.apply(_utils.is_state_funds, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14e14a4c", + "metadata": {}, + "outputs": 
[], + "source": [ + "final_phase_funding_table[\"is_federal\"] = final_phase_funding_table.apply(_utils.is_fed_funds, axis=1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a125f0f", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table[\"is_local\"] = final_phase_funding_table.apply(_utils.is_local_funds, axis=1)" + ] + }, + { + "cell_type": "markdown", + "id": "f83111de", + "metadata": {}, + "source": [ + "##### Check that I summed up federal funds correctly\n", + "* State not summing up correctly" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f9609041", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.loc[funding_w_program_info.ctips_id == 20920011849][['fund_type_1_fed_2_state_3_local']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7134d80d", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['con']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "421b4f27", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['rw']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c24b6f55", + "metadata": {}, + "outputs": [], + "source": [ + "funding_w_program_info.loc[(funding_w_program_info.ctips_id == 20920011849) & (funding_w_program_info.fund == 'Public Transportation Modernization Improvement-2.0-2.0')][['pe_total']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6671324b", + "metadata": {}, + "outputs": [], + "source": [ + "112386000.00+12213000.00+4351000.00 == 128950000.0" + ] + }, + { + 
"cell_type": "code", + "execution_count": null, + "id": "14d0f887", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table.loc[final_phase_funding_table.ctips_id == 20920011849].dropna(axis=1).T" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9a1bbfda", + "metadata": {}, + "outputs": [], + "source": [ + "(final_phase_funding_table\n", + " .groupby(['is_state', 'is_federal', 'is_local'])\n", + " .agg({'ctips_id':'nunique'})\n", + " .reset_index()\n", + " .sort_values(by = ['ctips_id']))" + ] + }, + { + "cell_type": "markdown", + "id": "ded58f7d", + "metadata": {}, + "source": [ + "#### Fifth: Drop everything before `con`\n", + "* Need to differentiate between `pe_total` for FTIP vs `pe_total` for everything else." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e10c8f52", + "metadata": {}, + "outputs": [], + "source": [ + "to_keep = ['ctips_id','con','rw', 'pe_env', 'pe_rw', 'pe_con', 'pe_total', 'total_federal_funds',\n", + " 'total_state_funds', 'total_local_funds', 'is_local', 'is_state',\n", + " 'is_federal']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d7173842", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table2 = final_phase_funding_table[to_keep]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ea702394", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table2.head()" + ] + }, + { + "cell_type": "markdown", + "id": "d5a5f2bb", + "metadata": {}, + "source": [ + "#### Sixth: Merge on `phase_dates_df` with all the phase dates" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6025a9f1", + "metadata": {}, + "outputs": [], + "source": [ + "phase_dates_df.ctips_id.nunique(), len(phase_dates_df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cafa19d", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.ctips_id.nunique(), 
len(projects_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f7d0efa3", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table.ctips_id.nunique(), len(final_phase_funding_table)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "77a24a8c", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table2 = pd.merge(final_phase_funding_table2, phase_dates_df, on = \"ctips_id\", how = \"outer\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "023f3588", + "metadata": {}, + "outputs": [], + "source": [ + "pd.merge(final_phase_funding_table2, phase_dates_df, on = \"ctips_id\", how = \"outer\", indicator = True)[['_merge']].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3fc198d0", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table2.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b6c2a48f", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.ctips_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "a7dc1126", + "metadata": {}, + "source": [ + "#### Seventh: Merge some other dates found in the `projects` dataframe." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "94edbad1", + "metadata": {}, + "outputs": [], + "source": [ + "project_date_cols = ['const_date', 'rtl', 'ctips_id', 'projcomp_date']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0e2fe592", + "metadata": {}, + "outputs": [], + "source": [ + "projects_dates = projects_table[project_date_cols]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "73a1417a", + "metadata": {}, + "outputs": [], + "source": [ + "project_date_cols.remove('ctips_id')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e4e98bd", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = projects_table.drop(columns = project_date_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57b81fba", + "metadata": {}, + "outputs": [], + "source": [ + "# I'm only interested in rows in which at least one of the dates is populated\n", + "projects_dates2 = projects_dates.loc[(projects_dates.rtl != 'datetime64[ns]') | (projects_dates.const_date != 'datetime64[ns]')].reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "818abaad", + "metadata": {}, + "outputs": [], + "source": [ + "projects_dates2 = projects_dates2.rename(columns = {'const_date': 'construction_completion_date', 'rtl':'ready_to_list_date'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "10943300", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table3 = pd.merge(final_phase_funding_table2, projects_dates2, on = 'ctips_id', how = 'left')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e0b5c9fd", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table3.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5002bf51", + "metadata": {}, + "outputs": [], + "source": [ +
"projects_table.ctips_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "4f590b0e", + "metadata": {}, + "source": [ + "### Awards Table\n", + "* Take final_fund_m and sort it by year\n", + "* Line year is \"fiscal year of this fund record Note that the year listed is the second in the pair of fiscal year notation. For example if the funds for this record are for fiscal year 1998/99, then this record will hold the value 1999.\"\n", + "* These aren't really programs, funds?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "52bfb5ea", + "metadata": {}, + "outputs": [], + "source": [ + "# del out '',\n", + "awards = funding_w_program_info[['ctips_id','line_year', 'program', 'progdesc']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dbcb89fb", + "metadata": {}, + "outputs": [], + "source": [ + "# Just drop dups across\n", + "awards2 = awards.drop_duplicates().reset_index(drop = True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2fe1729", + "metadata": {}, + "outputs": [], + "source": [ + "len(awards), len(awards2)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "306fde98", + "metadata": {}, + "outputs": [], + "source": [ + "awards3 = awards2.sort_values(by = ['ctips_id','program','line_year', ], ascending = [False, False, False])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef54634e", + "metadata": {}, + "outputs": [], + "source": [ + "awards4 = awards3.drop_duplicates(subset = ['ctips_id','program'])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "84467f0d", + "metadata": {}, + "outputs": [], + "source": [ + "awards4.ctips_id.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "de8480c4", + "metadata": {}, + "source": [ + "#### Check w/ one project" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6c4766bd", + "metadata": { + "scrolled": true + }, + "outputs": [], + 
"source": [ + "# awards3.loc[awards3.ctips_id == 20700001649]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8048727d", + "metadata": {}, + "outputs": [], + "source": [ + "awards4.loc[awards4.ctips_id == 20700001649]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8f0c78a8", + "metadata": {}, + "outputs": [], + "source": [ + "awards4.ctips_id.value_counts().describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bd4cacf9", + "metadata": {}, + "outputs": [], + "source": [ + "awards4.ctips_id.value_counts().head(10)" + ] + }, + { + "cell_type": "markdown", + "id": "33ec46a8", + "metadata": {}, + "source": [ + "### Political" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d256a277", + "metadata": {}, + "outputs": [], + "source": [ + "political_df = pd.read_sql_query(\"\"\" \n", + "SELECT \n", + "*\n", + "FROM ctips.politcal\n", + "\"\"\", engine) " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "95616eb2", + "metadata": {}, + "outputs": [], + "source": [ + "# Drop any rows with nulls\n", + "# There are a bunch of rows that are 0 \n", + "political_df_without_na = political_df.dropna(how = \"any\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ede35ba", + "metadata": {}, + "outputs": [], + "source": [ + "len(political_df), political_df.projectid.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3d0b0790", + "metadata": {}, + "outputs": [], + "source": [ + "political_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf56a9c", + "metadata": {}, + "outputs": [], + "source": [ + "len(political_df_without_na)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ef597573", + "metadata": {}, + "outputs": [], + "source": [ + "political_df_without_na.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "49ccb448", 
+ "metadata": {}, + "outputs": [], + "source": [ + "political_df2 = pd.merge(projects_table[['ctips_id', 'projectid']], political_df_without_na, on ='projectid', how = 'inner')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ee357e4e", + "metadata": {}, + "outputs": [], + "source": [ + "political_df2.shape, political_df2.projectid.nunique(), political_df2.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "82c8e22c", + "metadata": {}, + "outputs": [], + "source": [ + "assembly_df = _csis_utils.clean_political(political_df2, 'assembly')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e34803a", + "metadata": {}, + "outputs": [], + "source": [ + "assembly_df.ctips_id.value_counts().head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1d8586e2", + "metadata": {}, + "outputs": [], + "source": [ + "len(assembly_df), assembly_df.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4e36025e", + "metadata": {}, + "outputs": [], + "source": [ + "senate_df = _csis_utils.clean_political(political_df2, 'ssenate')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0d02b8b0", + "metadata": {}, + "outputs": [], + "source": [ + "len(senate_df), senate_df.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "37cfedd3", + "metadata": {}, + "outputs": [], + "source": [ + "ushouse_df = _csis_utils.clean_political(political_df2, 'ushouse')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "adbf20b9", + "metadata": {}, + "outputs": [], + "source": [ + "len(ushouse_df), ushouse_df.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "192df3c5", + "metadata": {}, + "outputs": [], + "source": [ + "ushouse_df.ushouse.value_counts().head()" + ] + }, + { + "cell_type": "markdown", + "id": "ba49612f", + "metadata": {}, + 
"source": [ + "#### Double check" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "58fbabd0", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "assembly_df.loc[assembly_df.ctips_id == 10900000289]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2940ed0f", + "metadata": {}, + "outputs": [], + "source": [ + "political_df2.loc[political_df2.ctips_id == 10900000289]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "638d08e0", + "metadata": {}, + "outputs": [], + "source": [ + "# projects_table.loc[projects_table.ctips_id == 10900000289]" + ] + }, + { + "cell_type": "markdown", + "id": "c2b6fa42", + "metadata": {}, + "source": [ + "## Save to Excel" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d08d935d", + "metadata": {}, + "outputs": [], + "source": [ + "district_df = projects_table[['ctips_id','districtid']].drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c9655708", + "metadata": {}, + "outputs": [], + "source": [ + "district_df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "31352c30", + "metadata": {}, + "outputs": [], + "source": [ + "# projects_table = projects_table.fillna(projects_table.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "622ac980", + "metadata": {}, + "outputs": [], + "source": [ + "district_df = district_df.fillna(district_df.dtypes.replace({'float64': 0.0, 'object': 'None'})).drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "88b9aa82", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table3 = final_phase_funding_table3.fillna(district_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dcc066c2", + "metadata": {}, + "outputs": [], + "source": 
[ + "awards4 = awards4.fillna(awards4.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e9967490", + "metadata": {}, + "outputs": [], + "source": [ + "ushouse_df = ushouse_df.fillna(ushouse_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a846c82d", + "metadata": {}, + "outputs": [], + "source": [ + "senate_df = senate_df.fillna(senate_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6bd77a85", + "metadata": {}, + "outputs": [], + "source": [ + "assembly_df = assembly_df.fillna(assembly_df.dtypes.replace({'float64': 0.0, 'object': 'None'}))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb81f83e", + "metadata": {}, + "outputs": [], + "source": [ + "assembly_df.ctips_id.nunique(), awards4.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e26e7701", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table3.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "bb1a3085", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6ed4aa81", + "metadata": {}, + "outputs": [], + "source": [ + "district_df.ctips_id.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f8d433cc", + "metadata": {}, + "outputs": [], + "source": [ + "# drop_cols = ['chg_offcl', 'chg_qual1', 'chg_qual2','districtid', 'appdate', 'version','projcomp_date', 'agencyid', 'projectid', 'archive', 'agency_name']" + ] + }, + { + "cell_type": "markdown", + "id": "f3552a50", + "metadata": {}, + "source": [ + "#### ASK WHY SOME PROJECTS have 15 which means the project is done? But archive is 0?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dde3c7b4", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.archive.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a804c48f", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table.chg_qual1.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5b8cbb62", + "metadata": {}, + "outputs": [], + "source": [ + "proj_cols_drop = ['appdate', 'archive',\n", + " 'high_ver', 'high_offcl', 'progcode1',\n", + " 'agencyid', 'projectid',\n", + " 'version', 'countyid']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cacb064a", + "metadata": {}, + "outputs": [], + "source": [ + "agency_cols = ['agency_name', 'implpaed_agency',\n", + " 'implpse_agency', 'implrw_agency', 'implcon_agency']" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b7fe0d71", + "metadata": {}, + "outputs": [], + "source": [ + "agency_df = projects_table[agency_cols + ['ctips_id']]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5ca7649f", + "metadata": {}, + "outputs": [], + "source": [ + "agency_df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "34b5ca5c", + "metadata": {}, + "outputs": [], + "source": [ + "projects_table = projects_table.drop(columns = proj_cols_drop + agency_cols)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bc2a413", + "metadata": {}, + "outputs": [], + "source": [ + "district_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "18d7f4d8", + "metadata": {}, + "outputs": [], + "source": [ + "county_projects_df2.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b717d091", + "metadata": {}, + "outputs": [], + "source": [ + "final_phase_funding_table3.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": 
"76719372", + "metadata": {}, + "outputs": [], + "source": [ + "awards4.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2b8f5328", + "metadata": {}, + "outputs": [], + "source": [ + "ushouse_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c68207d5", + "metadata": {}, + "outputs": [], + "source": [ + "senate_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb0745dd", + "metadata": {}, + "outputs": [], + "source": [ + "assembly_df.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc7fed4b", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "with pd.ExcelWriter(\"CTIPS_data.xlsx\") as writer:\n", + " projects_table.to_excel(writer, sheet_name=\"project\", index=False)\n", + " agency_df.to_excel(writer, sheet_name=\"agencies\", index=False)\n", + " district_df.to_excel(writer, sheet_name=\"district\", index=False)\n", + " county_projects_df2.to_excel(writer, sheet_name=\"county\", index=False)\n", + " final_phase_funding_table3.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n", + " awards4.to_excel(writer, sheet_name=\"awards\", index=False)\n", + " ushouse_df.to_excel(writer, sheet_name=\"us_house\", index=False)\n", + " senate_df.to_excel(writer, sheet_name=\"senate\", index=False)\n", + " assembly_df.to_excel(writer, sheet_name=\"assembly\", index=False)\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project_list/add_lp2000_10_27_2023.ipynb b/project_list/add_lp2000_01_24_2024.ipynb similarity index 51% 
rename from project_list/add_lp2000_10_27_2023.ipynb rename to project_list/add_lp2000_01_24_2024.ipynb index d957c0aa6..b2a8059cc 100644 --- a/project_list/add_lp2000_10_27_2023.ipynb +++ b/project_list/add_lp2000_01_24_2024.ipynb @@ -2,8 +2,8 @@ "cells": [ { "cell_type": "code", - "execution_count": 143, - "id": "690f3834", + "execution_count": 1, + "id": "b7e8790f", "metadata": {}, "outputs": [], "source": [ @@ -11,13 +11,14 @@ "import sqlalchemy \n", "import sys \n", "import re\n", - "import oracledb " + "import oracledb \n", + "import _database_utils as _utils " ] }, { "cell_type": "code", - "execution_count": 144, - "id": "bc9c8556", + "execution_count": 2, + "id": "b9f4a73c", "metadata": {}, "outputs": [], "source": [ @@ -27,8 +28,8 @@ }, { "cell_type": "code", - "execution_count": 145, - "id": "0d7cc5e7", + "execution_count": 3, + "id": "e06681e8", "metadata": {}, "outputs": [], "source": [ @@ -40,8 +41,8 @@ }, { "cell_type": "code", - "execution_count": 146, - "id": "150ac181", + "execution_count": 5, + "id": "a1bf2f34", "metadata": {}, "outputs": [], "source": [ @@ -50,8 +51,8 @@ }, { "cell_type": "code", - "execution_count": 148, - "id": "bdf25f1a", + "execution_count": 6, + "id": "053f212c", "metadata": {}, "outputs": [], "source": [ @@ -61,29 +62,17 @@ }, { "cell_type": "code", - "execution_count": 149, - "id": "5595350c", + "execution_count": 7, + "id": "f446f7a3", "metadata": {}, "outputs": [], "source": [ "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH) " ] }, - { - "cell_type": "code", - "execution_count": 150, - "id": "14389690", - "metadata": {}, - "outputs": [], - "source": [ - "def to_snakecase(df):\n", - " df.columns = df.columns.str.lower().str.replace(' ','_')\n", - " return df" - ] - }, { "cell_type": "markdown", - "id": "44b09522", + "id": "5e5e7b24", "metadata": {}, "source": [ "## Projects\n", @@ -92,8 +81,8 @@ }, { "cell_type": "code", - "execution_count": 151, - "id": "c9bb15d4", + "execution_count": 8, + "id": "254d963f", 
"metadata": { "scrolled": true }, @@ -134,8 +123,8 @@ }, { "cell_type": "code", - "execution_count": 152, - "id": "9c6a6136", + "execution_count": 9, + "id": "7138a0f2", "metadata": {}, "outputs": [], "source": [ @@ -144,8 +133,8 @@ }, { "cell_type": "code", - "execution_count": 153, - "id": "ca5a8ca7", + "execution_count": 10, + "id": "bf77a59d", "metadata": {}, "outputs": [], "source": [ @@ -154,8 +143,8 @@ }, { "cell_type": "code", - "execution_count": 154, - "id": "b88da8ed", + "execution_count": 11, + "id": "6ef5f157", "metadata": {}, "outputs": [], "source": [ @@ -164,8 +153,8 @@ }, { "cell_type": "code", - "execution_count": 155, - "id": "3f0361ad", + "execution_count": 12, + "id": "6dd37cbb", "metadata": {}, "outputs": [], "source": [ @@ -174,8 +163,8 @@ }, { "cell_type": "code", - "execution_count": 285, - "id": "3a8ca472", + "execution_count": 13, + "id": "a91e2bf9", "metadata": {}, "outputs": [ { @@ -183,32 +172,32 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 11768 entries, 0 to 11767\n", + "RangeIndex: 11272 entries, 0 to 11271\n", "Data columns (total 20 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", - " 0 project_id 11768 non-null object \n", - " 1 comment_desc 10892 non-null object \n", - " 2 district_code 11767 non-null object \n", - " 3 est_total_prj_costs 1083 non-null float64 \n", - " 4 location_name 11401 non-null object \n", - " 5 project_label_name 10906 non-null object \n", - " 6 original_post_mile_begin_id 750 non-null float64 \n", - " 7 original_post_mile_end_id 576 non-null float64 \n", + " 0 project_id 11272 non-null object \n", + " 1 comment_desc 10399 non-null object \n", + " 2 district_code 11271 non-null object \n", + " 3 est_total_prj_costs 1329 non-null float64 \n", + " 4 location_name 10906 non-null object \n", + " 5 project_label_name 10414 non-null object \n", + " 6 original_post_mile_begin_id 734 non-null float64 \n", + " 7 original_post_mile_end_id 570 non-null float64 
\n", " 8 revised_post_mile_begin_ind 20 non-null object \n", " 9 revised_post_mile_end_ind 15 non-null object \n", - " 10 route_name 11447 non-null object \n", - " 11 state_hwy_ind 11405 non-null object \n", + " 10 route_name 10950 non-null object \n", + " 11 state_hwy_ind 10909 non-null object \n", " 12 senate_district_code 0 non-null object \n", - " 13 update_date_time 11699 non-null datetime64[ns]\n", - " 14 agency_name 11767 non-null object \n", - " 15 urban_area_code 5826 non-null object \n", - " 16 county_name 11763 non-null object \n", - " 17 work_type_desc 2887 non-null object \n", - " 18 category_desc 10055 non-null object \n", - " 19 current_phase 11768 non-null object \n", + " 13 update_date_time 11203 non-null datetime64[ns]\n", + " 14 agency_name 11271 non-null object \n", + " 15 urban_area_code 5605 non-null object \n", + " 16 county_name 11267 non-null object \n", + " 17 work_type_desc 2711 non-null object \n", + " 18 category_desc 9581 non-null object \n", + " 19 current_phase 11272 non-null object \n", "dtypes: datetime64[ns](1), float64(3), object(16)\n", - "memory usage: 1.8+ MB\n" + "memory usage: 1.7+ MB\n" ] } ], @@ -218,7 +207,7 @@ }, { "cell_type": "markdown", - "id": "e0fe254b", + "id": "948e473b", "metadata": {}, "source": [ "## EA Number\n", @@ -228,8 +217,8 @@ }, { "cell_type": "code", - "execution_count": 156, - "id": "fb96a824", + "execution_count": 14, + "id": "c78a9816", "metadata": {}, "outputs": [], "source": [ @@ -245,8 +234,8 @@ }, { "cell_type": "code", - "execution_count": 157, - "id": "44018131", + "execution_count": 15, + "id": "3f67c483", "metadata": {}, "outputs": [ { @@ -255,7 +244,7 @@ "((49431, 4), 24130)" ] }, - "execution_count": 157, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -266,8 +255,8 @@ }, { "cell_type": "code", - "execution_count": 158, - "id": "eee7672c", + "execution_count": 16, + "id": "07a9f913", "metadata": {}, "outputs": [], "source": [ @@ -277,8 +266,8 @@ }, { 
"cell_type": "code", - "execution_count": 159, - "id": "8b7a6f5b", + "execution_count": 17, + "id": "342ee6c4", "metadata": {}, "outputs": [], "source": [ @@ -287,8 +276,8 @@ }, { "cell_type": "code", - "execution_count": 160, - "id": "8ca731d9", + "execution_count": 18, + "id": "b50c43f3", "metadata": {}, "outputs": [], "source": [ @@ -298,8 +287,8 @@ }, { "cell_type": "code", - "execution_count": 161, - "id": "b2cb6d0c", + "execution_count": 19, + "id": "8f43633c", "metadata": {}, "outputs": [], "source": [ @@ -309,8 +298,8 @@ }, { "cell_type": "code", - "execution_count": 162, - "id": "8f6816c6", + "execution_count": 20, + "id": "6c6b17aa", "metadata": {}, "outputs": [], "source": [ @@ -320,8 +309,8 @@ }, { "cell_type": "code", - "execution_count": 163, - "id": "638c4350", + "execution_count": 21, + "id": "6efbe740", "metadata": {}, "outputs": [], "source": [ @@ -331,17 +320,17 @@ }, { "cell_type": "code", - "execution_count": 164, - "id": "0547982c", + "execution_count": 22, + "id": "10aaabe1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "3030" + "2961" ] }, - "execution_count": 164, + "execution_count": 22, "metadata": {}, "output_type": "execute_result" } @@ -352,16 +341,16 @@ }, { "cell_type": "code", - "execution_count": 165, - "id": "5c57055d", + "execution_count": 23, + "id": "98e5481f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "count 2944.00\n", + "count 2880.00\n", "mean 1.03\n", - "std 0.18\n", + "std 0.17\n", "min 1.00\n", "25% 1.00\n", "50% 1.00\n", @@ -370,7 +359,7 @@ "Name: project_id, dtype: float64" ] }, - "execution_count": 165, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -381,17 +370,17 @@ }, { "cell_type": "code", - "execution_count": 166, - "id": "f245bf36", + "execution_count": 24, + "id": "8ae90cf5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2944" + "2880" ] }, - "execution_count": 166, + "execution_count": 24, "metadata": {}, "output_type": 
"execute_result" } @@ -402,22 +391,22 @@ }, { "cell_type": "code", - "execution_count": 167, - "id": "a03488a6", + "execution_count": 25, + "id": "ae8a5b68", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "5008(072) 3\n", + "5953(536) 3\n", "5932(042) 3\n", "5006(504) 3\n", - "5006(635) 3\n", - "5953(536) 3\n", + "5435(010) 2\n", "Name: project_id, dtype: int64" ] }, - "execution_count": 167, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -428,8 +417,8 @@ }, { "cell_type": "code", - "execution_count": 168, - "id": "2a24acdd", + "execution_count": 26, + "id": "d8e0c58a", "metadata": {}, "outputs": [ { @@ -460,39 +449,17 @@ " \n", " \n", " \n", - " \n", - " 2381\n", - " 07\n", - " 4S6608\n", - " 5006(635)\n", - " 2009-09-10 13:58:44\n", - " \n", - " \n", - " 2382\n", - " 07\n", - " 933575\n", - " 5006(635)\n", - " 2009-07-02 14:46:18\n", - " \n", - " \n", - " 2383\n", - " 07\n", - " 4U4414\n", - " 5006(635)\n", - " 2009-09-10 13:56:35\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " district_code expense_authorization_id project_id ea_assign_date\n", - "2381 07 4S6608 5006(635) 2009-09-10 13:58:44\n", - "2382 07 933575 5006(635) 2009-07-02 14:46:18\n", - "2383 07 4U4414 5006(635) 2009-09-10 13:56:35" + "Empty DataFrame\n", + "Columns: [district_code, expense_authorization_id, project_id, ea_assign_date]\n", + "Index: []" ] }, - "execution_count": 168, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -503,8 +470,8 @@ }, { "cell_type": "code", - "execution_count": 169, - "id": "682ff6e0", + "execution_count": 27, + "id": "6d002cda", "metadata": {}, "outputs": [], "source": [ @@ -520,17 +487,17 @@ }, { "cell_type": "code", - "execution_count": 170, - "id": "c9bfb0a4", + "execution_count": 28, + "id": "df8f4a43", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2944" + "2880" ] }, - "execution_count": 170, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } @@ 
-541,8 +508,8 @@ }, { "cell_type": "code", - "execution_count": 171, - "id": "c6993849", + "execution_count": 29, + "id": "4a81759a", "metadata": {}, "outputs": [ { @@ -615,7 +582,7 @@ "4 04 985979 5178(016)" ] }, - "execution_count": 171, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -626,17 +593,17 @@ }, { "cell_type": "code", - "execution_count": 172, - "id": "f9eff2f3", + "execution_count": 30, + "id": "28a7a7e1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "2873" + "2814" ] }, - "execution_count": 172, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -647,7 +614,7 @@ }, { "cell_type": "markdown", - "id": "19049968", + "id": "6f73b4a0", "metadata": {}, "source": [ "#### The same EA number matches to multiple projects\n", @@ -657,8 +624,8 @@ }, { "cell_type": "code", - "execution_count": 286, - "id": "503db9e1", + "execution_count": 31, + "id": "6575e623", "metadata": {}, "outputs": [ { @@ -690,21 +657,21 @@ " \n", " \n", " \n", - " 127\n", + " 120\n", " 01\n", " 924969\n", " 5904(114)\n", " 2011-02-28 10:37:39\n", " \n", " \n", - " 404\n", + " 390\n", " 03\n", " 924969\n", " 5238(018)\n", " 1998-06-04 00:00:00\n", " \n", " \n", - " 1428\n", + " 1396\n", " 08\n", " 924969\n", " NBIL(502)\n", @@ -716,12 +683,12 @@ ], "text/plain": [ " district_code expense_authorization_id project_id ea_assign_date\n", - "127 01 924969 5904(114) 2011-02-28 10:37:39\n", - "404 03 924969 5238(018) 1998-06-04 00:00:00\n", - "1428 08 924969 NBIL(502) 2006-06-23 16:18:52" + "120 01 924969 5904(114) 2011-02-28 10:37:39\n", + "390 03 924969 5238(018) 1998-06-04 00:00:00\n", + "1396 08 924969 NBIL(502) 2006-06-23 16:18:52" ] }, - "execution_count": 286, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -732,8 +699,8 @@ }, { "cell_type": "code", - "execution_count": 287, - "id": "0582d2a5", + "execution_count": 32, + "id": "3ebc8d00", "metadata": {}, "outputs": [ { @@ -765,14 +732,14 @@ 
" \n", " \n", " \n", - " 424\n", + " 409\n", " 08\n", " 924360\n", " 0061(025)\n", " 1998-12-23 00:00:00\n", " \n", " \n", - " 1248\n", + " 1218\n", " 04\n", " 924360\n", " 6003(030)\n", @@ -784,11 +751,11 @@ ], "text/plain": [ " district_code expense_authorization_id project_id ea_assign_date\n", - "424 08 924360 0061(025) 1998-12-23 00:00:00\n", - "1248 04 924360 6003(030) 2005-05-17 15:25:28" + "409 08 924360 0061(025) 1998-12-23 00:00:00\n", + "1218 04 924360 6003(030) 2005-05-17 15:25:28" ] }, - "execution_count": 287, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -799,8 +766,8 @@ }, { "cell_type": "code", - "execution_count": 175, - "id": "e58e2bc7", + "execution_count": 33, + "id": "8c407170", "metadata": {}, "outputs": [], "source": [ @@ -809,8 +776,8 @@ }, { "cell_type": "code", - "execution_count": 176, - "id": "36be128e", + "execution_count": 34, + "id": "d5982845", "metadata": {}, "outputs": [], "source": [ @@ -819,7 +786,7 @@ }, { "cell_type": "markdown", - "id": "f22f3bd9", + "id": "b41a3e92", "metadata": {}, "source": [ "* Shares the EA of 924360" @@ -827,8 +794,8 @@ }, { "cell_type": "code", - "execution_count": 177, - "id": "c4327e30", + "execution_count": 35, + "id": "00e0889b", "metadata": {}, "outputs": [], "source": [ @@ -837,7 +804,7 @@ }, { "cell_type": "markdown", - "id": "7e25fe9c", + "id": "cbd3c992", "metadata": {}, "source": [ "## EFIS_MV_BUD_STRU_94_LVL_3_VW\n", @@ -848,8 +815,8 @@ }, { "cell_type": "code", - "execution_count": 178, - "id": "f9f06cfa", + "execution_count": 36, + "id": "adeb37a5", "metadata": {}, "outputs": [], "source": [ @@ -869,17 +836,17 @@ }, { "cell_type": "code", - "execution_count": 179, - "id": "dfb521d0", + "execution_count": 37, + "id": "664c144a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(45227, 7)" + "(45666, 7)" ] }, - "execution_count": 179, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } @@ -890,17 +857,17 @@ }, { "cell_type": 
"code", - "execution_count": 180, - "id": "52422939", + "execution_count": 38, + "id": "1cc8c5d9", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "19552" + "19821" ] }, - "execution_count": 180, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -911,7 +878,7 @@ }, { "cell_type": "markdown", - "id": "e8befd89", + "id": "2a74da37", "metadata": {}, "source": [ "### Efis Join\n", @@ -920,8 +887,8 @@ }, { "cell_type": "code", - "execution_count": 181, - "id": "e046b88c", + "execution_count": 39, + "id": "972329d2", "metadata": {}, "outputs": [], "source": [ @@ -936,7 +903,7 @@ }, { "cell_type": "markdown", - "id": "d1c4df99", + "id": "3c566ba3", "metadata": {}, "source": [ "* Exclude project status because it's just the financial status of the project, not construction or whatever." @@ -944,8 +911,8 @@ }, { "cell_type": "code", - "execution_count": 182, - "id": "a7232b09", + "execution_count": 40, + "id": "c936398b", "metadata": {}, "outputs": [], "source": [ @@ -958,8 +925,8 @@ }, { "cell_type": "code", - "execution_count": 183, - "id": "c1a9bf0e", + "execution_count": 41, + "id": "a39f1bad", "metadata": {}, "outputs": [], "source": [ @@ -968,8 +935,8 @@ }, { "cell_type": "code", - "execution_count": 184, - "id": "e811b5e2", + "execution_count": 42, + "id": "e3c9315e", "metadata": {}, "outputs": [], "source": [ @@ -979,8 +946,8 @@ }, { "cell_type": "code", - "execution_count": 185, - "id": "9c7336f8", + "execution_count": 43, + "id": "c818486a", "metadata": {}, "outputs": [], "source": [ @@ -990,8 +957,8 @@ }, { "cell_type": "code", - "execution_count": 186, - "id": "4d42c325", + "execution_count": 44, + "id": "918cc1c9", "metadata": {}, "outputs": [], "source": [ @@ -1002,21 +969,21 @@ }, { "cell_type": "code", - "execution_count": 187, - "id": "0f8623fb", + "execution_count": 45, + "id": "13a41288", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "_merge \n", - "left_only 34284\n", - "both 10958\n", - "right_only 
85\n", + "left_only 34832\n", + "both 10849\n", + "right_only 83\n", "dtype: int64" ] }, - "execution_count": 187, + "execution_count": 45, "metadata": {}, "output_type": "execute_result" } @@ -1027,8 +994,8 @@ }, { "cell_type": "code", - "execution_count": 188, - "id": "8a6cde72", + "execution_count": 46, + "id": "bd3a6659", "metadata": {}, "outputs": [], "source": [ @@ -1037,7 +1004,7 @@ }, { "cell_type": "markdown", - "id": "35caae64", + "id": "1c85c416", "metadata": {}, "source": [ "* 77 project ids missing after inner join." @@ -1045,17 +1012,17 @@ }, { "cell_type": "code", - "execution_count": 189, - "id": "afceb420", + "execution_count": 47, + "id": "7b2672b6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4788" + "4810" ] }, - "execution_count": 189, + "execution_count": 47, "metadata": {}, "output_type": "execute_result" } @@ -1066,17 +1033,17 @@ }, { "cell_type": "code", - "execution_count": 190, - "id": "f89ecbbd", + "execution_count": 48, + "id": "a5b6529d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4718" + "4742" ] }, - "execution_count": 190, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1087,7 +1054,7 @@ }, { "cell_type": "markdown", - "id": "6f2a4ece", + "id": "bfb5e764", "metadata": {}, "source": [ "## Subset only for the relevant project_ids from `Projects`\n", @@ -1096,8 +1063,8 @@ }, { "cell_type": "code", - "execution_count": 191, - "id": "f11e6fc4", + "execution_count": 49, + "id": "0c99473a", "metadata": {}, "outputs": [], "source": [ @@ -1106,17 +1073,17 @@ }, { "cell_type": "code", - "execution_count": 192, - "id": "83b1c540", + "execution_count": 50, + "id": "0be547fb", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(11768, (11768, 1))" + "(11272, (11272, 1))" ] }, - "execution_count": 192, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } @@ -1128,17 +1095,17 @@ }, { "cell_type": "code", - "execution_count": 193, - "id": "983ce05c", + 
"execution_count": 51, + "id": "98a8470f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(11768, 1)" + "(11272, 1)" ] }, - "execution_count": 193, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -1149,21 +1116,21 @@ }, { "cell_type": "code", - "execution_count": 194, - "id": "a2cf22ad", + "execution_count": 52, + "id": "abf5aa6c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "_merge \n", - "both 10289\n", - "right_only 7463\n", - "left_only 669\n", + "both 10186\n", + "right_only 6942\n", + "left_only 663\n", "dtype: int64" ] }, - "execution_count": 194, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -1174,8 +1141,8 @@ }, { "cell_type": "code", - "execution_count": 195, - "id": "cb24c2e2", + "execution_count": 53, + "id": "715ef53f", "metadata": {}, "outputs": [], "source": [ @@ -1184,17 +1151,17 @@ }, { "cell_type": "code", - "execution_count": 196, - "id": "9b238f72", + "execution_count": 54, + "id": "a0b7ce55", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4305" + "4330" ] }, - "execution_count": 196, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -1206,8 +1173,8 @@ }, { "cell_type": "code", - "execution_count": 197, - "id": "f83ce15d", + "execution_count": 55, + "id": "a276ed04", "metadata": {}, "outputs": [ { @@ -1221,7 +1188,7 @@ "Name: project_id, dtype: int64" ] }, - "execution_count": 197, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1232,7 +1199,7 @@ }, { "cell_type": "markdown", - "id": "3e7dd4af", + "id": "e9341b04", "metadata": {}, "source": [ "## Bring in pect_description for `Projects` -> Double Check\n", @@ -1244,13 +1211,13 @@ }, { "cell_type": "code", - "execution_count": 198, - "id": "33194243", + "execution_count": 56, + "id": "67c1fc59", "metadata": {}, "outputs": [], "source": [ "def load_pec(excel_file:str)-> pd.DataFrame:\n", - " df = to_snakecase(pd.read_excel(excel_file))\n", 
+ " df = _utils.to_snakecase(pd.read_excel(excel_file))\n", " \n", " # Drop rows that are all nan\n", " df = df.dropna(how='all').reset_index(drop = True)\n", @@ -1286,8 +1253,8 @@ }, { "cell_type": "code", - "execution_count": 199, - "id": "96b9ab27", + "execution_count": 57, + "id": "a5ddf1bf", "metadata": {}, "outputs": [ { @@ -1304,8 +1271,8 @@ }, { "cell_type": "code", - "execution_count": 200, - "id": "da554116", + "execution_count": 58, + "id": "72100e15", "metadata": {}, "outputs": [ { @@ -1314,7 +1281,7 @@ "(799, 4)" ] }, - "execution_count": 200, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } @@ -1325,8 +1292,8 @@ }, { "cell_type": "code", - "execution_count": 201, - "id": "9abb7310", + "execution_count": 59, + "id": "95554c2e", "metadata": {}, "outputs": [ { @@ -1358,43 +1325,48 @@ " \n", " \n", " \n", - " 720\n", - " 4050203\n", - " 845\n", - " Dumbarton Bridge RM1\n", - " Toll Bridge Program\n", + " 682\n", + " 4050201\n", + " 151\n", + " Drainage System Restoration\n", + " State Hwy Operation & Protection Program (SHOPP)\n", " \n", " \n", - " 476\n", - " 2080385\n", - " 851\n", - " Reimbursement from BATA - Antioch\n", - " Reimbursement of Toll Bridge Maintenance and Toll Collection Costs from the Bay Area Toll Authority ( BATA)\n", + " 494\n", + " 2080437\n", + " 0\n", + " TMS Electrical Material Procurement\n", + " Transportation Management System (TMS) Electrical Material Procurement\n", " \n", " \n", - " 486\n", - " 2080410\n", - " 0\n", - " Lighting\n", - " Lighting\n", + " 331\n", + " 2030010\n", + " 630\n", + " Rebuilding American Infrastructure with Sustainability and Equity (RAISE) and Multimodal Project Discretionary Grant Programs (e.g., INFRA, MEGA, RSTG or RURAL)\n", + " Local Assistance\n", " \n", " \n", "\n", "" ], "text/plain": [ - " pec pect pect_description \\\n", - "720 4050203 845 Dumbarton Bridge RM1 \n", - "476 2080385 851 Reimbursement from BATA - Antioch \n", - "486 2080410 0 Lighting \n", - "\n", - 
" program \n", - "720 Toll Bridge Program \n", - "476 Reimbursement of Toll Bridge Maintenance and Toll Collection Costs from the Bay Area Toll Authority ( BATA) \n", - "486 Lighting " + " pec pect \\\n", + "682 4050201 151 \n", + "494 2080437 0 \n", + "331 2030010 630 \n", + "\n", + " pect_description \\\n", + "682 Drainage System Restoration \n", + "494 TMS Electrical Material Procurement \n", + "331 Rebuilding American Infrastructure with Sustainability and Equity (RAISE) and Multimodal Project Discretionary Grant Programs (e.g., INFRA, MEGA, RSTG or RURAL) \n", + "\n", + " program \n", + "682 State Hwy Operation & Protection Program (SHOPP) \n", + "494 Transportation Management System (TMS) Electrical Material Procurement \n", + "331 Local Assistance " ] }, - "execution_count": 201, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -1405,7 +1377,7 @@ }, { "cell_type": "markdown", - "id": "4e3e39a6", + "id": "7ae7dade", "metadata": {}, "source": [ "### Turn this part to script once finalized" @@ -1413,8 +1385,8 @@ }, { "cell_type": "code", - "execution_count": 202, - "id": "9d07dc9c", + "execution_count": 60, + "id": "d16d49fe", "metadata": {}, "outputs": [], "source": [ @@ -1423,8 +1395,8 @@ }, { "cell_type": "code", - "execution_count": 203, - "id": "50498b4b", + "execution_count": 61, + "id": "1915efd6", "metadata": {}, "outputs": [], "source": [ @@ -1433,8 +1405,8 @@ }, { "cell_type": "code", - "execution_count": 204, - "id": "de116827", + "execution_count": 62, + "id": "5b161320", "metadata": {}, "outputs": [ { @@ -1492,7 +1464,7 @@ "0 -38.99 535 6200(024) " ] }, - "execution_count": 204, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" } @@ -1503,8 +1475,8 @@ }, { "cell_type": "code", - "execution_count": 205, - "id": "f6915f79", + "execution_count": 63, + "id": "2b9bc1bf", "metadata": {}, "outputs": [ { @@ -1544,45 +1516,45 @@ " \n", " \n", " \n", - " 7498\n", - " 0813000007\n", + " 1678\n", + " 
0316000060\n", " 0890\n", " 2030010\n", - " 1112\n", - " 1645967.06\n", - " 1645967.06\n", - " 300\n", - " 5954(108)\n", + " 2122\n", + " 200000.00\n", + " 62549.18\n", + " 820\n", + " 6203(069)\n", " 2030010\n", - " 300.00\n", - " Highway Bridge\n", + " 820.00\n", + " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", " Local Assistance\n", " \n", " \n", - " 4227\n", - " 0517000187\n", - " 0890\n", - " 2030010\n", - " 2122\n", - " 82876.00\n", - " 0.00\n", - " 650\n", - " 32L0(084)\n", - " 2030010\n", - " 650.00\n", - " Emergency Relief (ER)\n", - " Local Assistance\n", + " 6994\n", + " 0722000309\n", + " 3290\n", + " 2030720\n", + " 2021\n", + " 10000.00\n", + " 3520.00\n", + " 100\n", + " 5352(023)\n", + " 2030720\n", + " 100.00\n", + " Active Transportation Program (ATP)\n", + " Active Transportation Program (ATP)\n", " \n", " \n", - " 2899\n", - " 0416000110\n", + " 2823\n", + " 0415000111\n", " 0890\n", " 2030010\n", - " 1516\n", - " 126000.00\n", - " 126000.00\n", + " 1617\n", + " 350000.00\n", + " 350000.00\n", " 300\n", - " 5094(065)\n", + " 6003(052)\n", " 2030010\n", " 300.00\n", " Highway Bridge\n", @@ -1594,22 +1566,27 @@ ], "text/plain": [ " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "7498 0813000007 0890 2030010 1112 \n", - "4227 0517000187 0890 2030010 2122 \n", - "2899 0416000110 0890 2030010 1516 \n", + "1678 0316000060 0890 2030010 2122 \n", + "6994 0722000309 3290 2030720 2021 \n", + "2823 0415000111 0890 2030010 1617 \n", "\n", " curr_bud_am cash_exp_am pect_task_code project_id pec pect \\\n", - "7498 1645967.06 1645967.06 300 5954(108) 2030010 300.00 \n", - "4227 82876.00 0.00 650 32L0(084) 2030010 650.00 \n", - "2899 126000.00 126000.00 300 5094(065) 2030010 300.00 \n", + "1678 200000.00 62549.18 820 6203(069) 2030010 820.00 \n", + "6994 10000.00 3520.00 100 5352(023) 2030720 100.00 \n", + "2823 350000.00 350000.00 300 6003(052) 2030010 300.00 \n", "\n", - " pect_description program \n", - "7498 
Highway Bridge Local Assistance \n", - "4227 Emergency Relief (ER) Local Assistance \n", - "2899 Highway Bridge Local Assistance " + " pect_description \\\n", + "1678 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", + "6994 Active Transportation Program (ATP) \n", + "2823 Highway Bridge \n", + "\n", + " program \n", + "1678 Local Assistance \n", + "6994 Active Transportation Program (ATP) \n", + "2823 Local Assistance " ] }, - "execution_count": 205, + "execution_count": 63, "metadata": {}, "output_type": "execute_result" } @@ -1620,17 +1597,17 @@ }, { "cell_type": "code", - "execution_count": 206, - "id": "d17b923a", + "execution_count": 64, + "id": "f0749fec", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(10289, 4305)" + "(10186, 4330)" ] }, - "execution_count": 206, + "execution_count": 64, "metadata": {}, "output_type": "execute_result" } @@ -1641,8 +1618,8 @@ }, { "cell_type": "code", - "execution_count": 207, - "id": "84a31505", + "execution_count": 65, + "id": "23164a0d", "metadata": {}, "outputs": [], "source": [ @@ -1652,8 +1629,8 @@ }, { "cell_type": "code", - "execution_count": 208, - "id": "863b113e", + "execution_count": 66, + "id": "af61c211", "metadata": {}, "outputs": [], "source": [ @@ -1663,17 +1640,17 @@ }, { "cell_type": "code", - "execution_count": 209, - "id": "8d33c12c", + "execution_count": 67, + "id": "26e84b16", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "10096" + "9999" ] }, - "execution_count": 209, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -1685,8 +1662,8 @@ }, { "cell_type": "code", - "execution_count": 210, - "id": "27c3a8a5", + "execution_count": 68, + "id": "25132e11", "metadata": {}, "outputs": [], "source": [ @@ -1695,8 +1672,8 @@ }, { "cell_type": "code", - "execution_count": 211, - "id": "56fbc89c", + "execution_count": 69, + "id": "6b9911a3", "metadata": {}, "outputs": [], "source": [ @@ -1708,8 +1685,8 @@ }, { "cell_type": "code", - 
"execution_count": 212, - "id": "f53db68f", + "execution_count": 70, + "id": "f4502e96", "metadata": {}, "outputs": [], "source": [ @@ -1718,8 +1695,8 @@ }, { "cell_type": "code", - "execution_count": 213, - "id": "62d990e9", + "execution_count": 71, + "id": "13866e8a", "metadata": {}, "outputs": [], "source": [ @@ -1728,8 +1705,8 @@ }, { "cell_type": "code", - "execution_count": 214, - "id": "d065468d", + "execution_count": 72, + "id": "e428d60c", "metadata": {}, "outputs": [], "source": [ @@ -1738,17 +1715,17 @@ }, { "cell_type": "code", - "execution_count": 215, - "id": "182af512", + "execution_count": 73, + "id": "c23e02b1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(4305, 4305, 4305)" + "(4330, 4330, 4330)" ] }, - "execution_count": 215, + "execution_count": 73, "metadata": {}, "output_type": "execute_result" } @@ -1759,18 +1736,18 @@ }, { "cell_type": "code", - "execution_count": 216, - "id": "391cfc99", + "execution_count": 74, + "id": "9106c0ef", "metadata": {}, "outputs": [], "source": [ - "pect_df3 = to_snakecase(pect_df3)" + "pect_df3 = _utils.to_snakecase(pect_df3)" ] }, { "cell_type": "code", - "execution_count": 217, - "id": "98a7e1a6", + "execution_count": 75, + "id": "653ba5a5", "metadata": { "scrolled": true }, @@ -1783,8 +1760,8 @@ }, { "cell_type": "code", - "execution_count": 218, - "id": "6a5e322b", + "execution_count": 76, + "id": "eed16ac2", "metadata": {}, "outputs": [], "source": [ @@ -1793,8 +1770,8 @@ }, { "cell_type": "code", - "execution_count": 219, - "id": "1e767e54", + "execution_count": 77, + "id": "be991521", "metadata": {}, "outputs": [], "source": [ @@ -1804,21 +1781,21 @@ }, { "cell_type": "code", - "execution_count": 220, - "id": "864532fc", + "execution_count": 78, + "id": "6c7b81c5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "_merge \n", - "right_only 7463\n", - "both 4305\n", + "right_only 6942\n", + "both 4330\n", "left_only 0\n", "dtype: int64" ] }, - "execution_count": 220, + 
"execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -1829,8 +1806,8 @@ }, { "cell_type": "code", - "execution_count": 221, - "id": "499e4dcc", + "execution_count": 79, + "id": "f7e7d7f3", "metadata": {}, "outputs": [], "source": [ @@ -1841,8 +1818,8 @@ }, { "cell_type": "code", - "execution_count": 222, - "id": "4c59d9e9", + "execution_count": 80, + "id": "034acf09", "metadata": {}, "outputs": [], "source": [ @@ -1852,17 +1829,17 @@ }, { "cell_type": "code", - "execution_count": 223, - "id": "83765a97", + "execution_count": 81, + "id": "ad74eafc", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "11768" + "11272" ] }, - "execution_count": 223, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -1873,8 +1850,8 @@ }, { "cell_type": "code", - "execution_count": 224, - "id": "a02d34e9", + "execution_count": 82, + "id": "6e2873ec", "metadata": {}, "outputs": [], "source": [ @@ -1884,7 +1861,7 @@ }, { "cell_type": "markdown", - "id": "b08e0591", + "id": "d414f29a", "metadata": {}, "source": [ "### Double check" @@ -1892,8 +1869,8 @@ }, { "cell_type": "code", - "execution_count": 225, - "id": "d7bb56fa", + "execution_count": 83, + "id": "d2561ecb", "metadata": {}, "outputs": [ { @@ -1924,25 +1901,25 @@ " \n", " \n", " \n", - " 1610\n", + " 1615\n", " Highway Bridge\n", " 690839.49\n", " 5918(101)\n", " \n", " \n", - " 1611\n", + " 1616\n", " Earmarks Projects (HPP, DEMO CPFCDS, etc.)\n", " 238679.79\n", " 5918(101)\n", " \n", " \n", - " 1612\n", + " 1617\n", " Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)\n", " 0.00\n", " 5918(101)\n", " \n", " \n", - " 1613\n", + " 1618\n", " Highway Bridge\n", " 472887.51\n", " 5918(101)\n", @@ -1953,19 +1930,19 @@ ], "text/plain": [ " pect_description \\\n", - "1610 Highway Bridge \n", - "1611 Earmarks Projects (HPP, DEMO CPFCDS, etc.) 
\n", - "1612 Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP) \n", - "1613 Highway Bridge \n", + "1615 Highway Bridge \n", + "1616 Earmarks Projects (HPP, DEMO CPFCDS, etc.) \n", + "1617 Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP) \n", + "1618 Highway Bridge \n", "\n", " curr_bud_am project_id \n", - "1610 690839.49 5918(101) \n", - "1611 238679.79 5918(101) \n", - "1612 0.00 5918(101) \n", - "1613 472887.51 5918(101) " + "1615 690839.49 5918(101) \n", + "1616 238679.79 5918(101) \n", + "1617 0.00 5918(101) \n", + "1618 472887.51 5918(101) " ] }, - "execution_count": 225, + "execution_count": 83, "metadata": {}, "output_type": "execute_result" } @@ -1976,93 +1953,93 @@ }, { "cell_type": "code", - "execution_count": 226, - "id": "8c1f5e90", + "execution_count": 84, + "id": "50113d38", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", + " }
project_id comment_desc district_code est_total_prj_costs location_name project_label_name original_post_mile_begin_id original_post_mile_end_id revised_post_mile_begin_ind revised_post_mile_end_ind route_name state_hwy_ind senate_district_code update_date_time agency_name urban_area_code county_name work_type_desc category_desc current_phase active_transportation_program_(atp) bridge_inspection_&_scour_evaluation covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation carbon_reduction_program_(crp) congestion_mitigation_&_air_quality_improvement_program_(cmaq) coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds corridor_mobility_improvement_account_(cmia)_program county_exchange_funds county_state_match_program earmarks_projects_(hpp,_demo_cpfcds,_etc.) emergency_relief_(er) ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program funds_for_planning,_programming_and_monitoring_-_rip general_funded_designated_programs hazard_elimination_safety_(hes) high_risk_rural_roads_program_(hr3) highway_bridge_ highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund highway_safety_improvement_program_(hsip)_(non-infrastructure) highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund local_partnership_program_(lpp_–_competitive)_ local_roads local_roads_rehabilitation railroad_grade_crossing_protection railroad_grade_separations rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_ regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system) regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip) regional_transportation_planning_agency_(rtpa)_stp_match_exchange sb1_funded_freeway_service_patrol shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds 
safe_routes_to_school_(sr2s_and_srts) set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act solutions_for_congested_corridors_program_(sccp) special_programs state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic) structures_seismic_retrofit_ trade_corridor_enhancement_account_(tcea)_programs_–_local_share trade_corridor_enhancement_account_(tcea)_programs_–_state_share trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads traffic_congestion_relief_program_(_tcrp_) unknown
\n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
project_id comment_desc district_code est_total_prj_costs location_name project_label_name original_post_mile_begin_id original_post_mile_end_id revised_post_mile_begin_ind revised_post_mile_end_ind route_name state_hwy_ind senate_district_code update_date_time agency_name urban_area_code county_name work_type_desc category_desc current_phase active_transportation_program_(atp) bridge_inspection_&_scour_evaluation covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation carbon_reduction_program_(crp) congestion_mitigation_&_air_quality_improvement_program_(cmaq) coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds corridor_mobility_improvement_account_(cmia)_program county_exchange_funds county_state_match_program earmarks_projects_(hpp,_demo_cpfcds,_etc.) emergency_relief_(er) ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program funds_for_planning,_programming_and_monitoring_-_rip general_funded_designated_programs hazard_elimination_safety_(hes) high_risk_rural_roads_program_(hr3) highway_bridge_ highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund highway_safety_improvement_program_(hsip)_(non-infrastructure) highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund local_partnership_program_(lpp_–_competitive)_ local_roads local_roads_rehabilitation railroad_grade_crossing_protection railroad_grade_separations rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_ regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system) regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip) regional_transportation_planning_agency_(rtpa)_stp_match_exchange sb1_funded_freeway_service_patrol shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds 
safe_routes_to_school_(sr2s_and_srts) set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act solutions_for_congested_corridors_program_(sccp) special_programs state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic) structures_seismic_retrofit_ trade_corridor_enhancement_account_(tcea)_programs_–_local_share trade_corridor_enhancement_account_(tcea)_programs_–_state_share trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads traffic_congestion_relief_program_(_tcrp_) unknown
14135918(101)4-26-2023: told Neal Hay to do a BAR request and that he cannot ask for more than what was lapsed - JC\n", + " 12775918(101)4-26-2023: told Neal Hay to do a BAR request and that he cannot ask for more than what was lapsed - JC\n", "\n", "1/10/22: TCT JWalton adv of CWA expring and funds lapsing. need invoice by Apr 1, 2022. ab\n", "8/2/17: email SRiddle re: inactive status. ab\n", "County will seek to replace (SR= 53.6)03nanOn Howsley Road, 1.02 Mile East of State Route 99, BrBridge ReplacementnannanNoneNone0-CRNNone2023-04-26 15:16:25Sutter CountyNoneSutter CountyBridge Replacement - No Added CapacityBridge Replacementsingle phaseNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoNoYesNoNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoNoNoNoNoNoNoNoNoNo03nanOn Howsley Road, 1.02 Mile East of State Route 99, BrBridge ReplacementnannanNoneNone0-CRNNone2023-04-26 15:16:25Sutter CountyNoneSutter CountyBridge Replacement - No Added CapacityBridge Replacementsingle phaseNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoNoYesNoNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoNoNoNoNoNoNoNoNoNo
" ], "text/plain": [ - "" + "" ] }, - "execution_count": 226, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -2073,7 +2050,7 @@ }, { "cell_type": "markdown", - "id": "e2d50c21", + "id": "734fa9d5", "metadata": {}, "source": [ "## Phase_Funding Table" @@ -2081,7 +2058,7 @@ }, { "cell_type": "markdown", - "id": "ebf0c60f", + "id": "8d673df6", "metadata": {}, "source": [ "### Bring in fund_code\n", @@ -2090,8 +2067,8 @@ }, { "cell_type": "code", - "execution_count": 227, - "id": "8a344942", + "execution_count": 85, + "id": "38f9b021", "metadata": {}, "outputs": [], "source": [ @@ -2107,8 +2084,8 @@ }, { "cell_type": "code", - "execution_count": 228, - "id": "7344566e", + "execution_count": 86, + "id": "cf667c4c", "metadata": {}, "outputs": [], "source": [ @@ -2117,8 +2094,8 @@ }, { "cell_type": "code", - "execution_count": 229, - "id": "c8bc42a1", + "execution_count": 87, + "id": "eed59562", "metadata": {}, "outputs": [ { @@ -2148,32 +2125,32 @@ " \n", " \n", " \n", - " 3\n", - " 0183\n", - " Environmental Enhanc & Mitigat Prgm Fd\n", + " 12\n", + " 6056\n", + " Trade Corridors Improvement Fund\n", " \n", " \n", - " 10\n", - " 3291\n", - " Trade Corridor Enhancement Account, STF\n", + " 5\n", + " 3007\n", + " Traffic Congestion Relief Fund\n", " \n", " \n", - " 1\n", - " 0045\n", - " Bicycle Transportation Account\n", + " 3\n", + " 0183\n", + " Environmental Enhanc & Mitigat Prgm Fd\n", " \n", " \n", "\n", "" ], "text/plain": [ - " 0001 general_fund\n", - "3 0183 Environmental Enhanc & Mitigat Prgm Fd\n", - "10 3291 Trade Corridor Enhancement Account, STF\n", - "1 0045 Bicycle Transportation Account" + " 0001 general_fund\n", + "12 6056 Trade Corridors Improvement Fund\n", + "5 3007 Traffic Congestion Relief Fund\n", + "3 0183 Environmental Enhanc & Mitigat Prgm Fd" ] }, - "execution_count": 229, + "execution_count": 87, "metadata": {}, "output_type": "execute_result" } @@ -2184,8 +2161,8 @@ }, { "cell_type": "code", - "execution_count": 
230, - "id": "d5432747", + "execution_count": 88, + "id": "990c4044", "metadata": {}, "outputs": [], "source": [ @@ -2195,17 +2172,17 @@ }, { "cell_type": "code", - "execution_count": 231, - "id": "9d010bfb", + "execution_count": 89, + "id": "2d4e7ec1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4305" + "4330" ] }, - "execution_count": 231, + "execution_count": 89, "metadata": {}, "output_type": "execute_result" } @@ -2216,8 +2193,8 @@ }, { "cell_type": "code", - "execution_count": 232, - "id": "acaf7b88", + "execution_count": 90, + "id": "67589974", "metadata": {}, "outputs": [], "source": [ @@ -2226,32 +2203,32 @@ }, { "cell_type": "code", - "execution_count": 233, - "id": "4e9538f6", + "execution_count": 91, + "id": "bc661a10", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Federal Trust Fund 7647\n", - "State Highway Account 1575\n", - "Road Maintenance & Rehabilitation Account, STF 433\n", - "Unknown 402\n", - "Local Bridge Seismic Retrofit Acct 73\n", + "Federal Trust Fund 7464\n", + "State Highway Account 1563\n", + "Unknown 505\n", + "Road Maintenance & Rehabilitation Account, STF 424\n", + "Local Bridge Seismic Retrofit Acct 72\n", "Environmental Enhanc & Mitigat Prgm Fd 56\n", - "Transportation Investment Fund 29\n", + "Transportation Investment Fund 28\n", "Transportation Deferred Investment Fund 19\n", "Trade Corridor Enhancement Account, STF 18\n", "Trade Corridors Improvement Fund 13\n", "Traffic Congestion Relief Fund 8\n", - "State-Local Partnership Program Acct 7\n", "Highway Safety,Rehabilitation,& Preservation Acct 7\n", - "Corridor Mobility Improvement Account 1\n", + "State-Local Partnership Program Acct 7\n", "Transportation Faciilities Account 1\n", + "Corridor Mobility Improvement Account 1\n", "Name: general_fund, dtype: int64" ] }, - "execution_count": 233, + "execution_count": 91, "metadata": {}, "output_type": "execute_result" } @@ -2262,8 +2239,8 @@ }, { "cell_type": "code", - "execution_count": 288, - 
"id": "1fb223cf", + "execution_count": 92, + "id": "cc3fbc10", "metadata": {}, "outputs": [], "source": [ @@ -2280,8 +2257,8 @@ }, { "cell_type": "code", - "execution_count": 235, - "id": "8a90d8e7", + "execution_count": 93, + "id": "f1f73846", "metadata": {}, "outputs": [ { @@ -2342,10 +2319,10 @@ " \n", " \n", " 4\n", - " 15A5(001)\n", + " 15A5(013)\n", " Federal Trust Fund\n", - " 849820.30\n", - " 700737.01\n", + " 172633.00\n", + " 0.00\n", " \n", " \n", "\n", @@ -2357,10 +2334,10 @@ "1 0014(005) Federal Trust Fund 879983.23 879983.23\n", "2 0027(012) Federal Trust Fund 12830458.87 12830458.87\n", "3 0061(025) Federal Trust Fund 2595722.00 2595722.00\n", - "4 15A5(001) Federal Trust Fund 849820.30 700737.01" + "4 15A5(013) Federal Trust Fund 172633.00 0.00" ] }, - "execution_count": 235, + "execution_count": 93, "metadata": {}, "output_type": "execute_result" } @@ -2371,8 +2348,8 @@ }, { "cell_type": "code", - "execution_count": 236, - "id": "7beb8071", + "execution_count": 94, + "id": "6060dc89", "metadata": {}, "outputs": [], "source": [ @@ -2389,8 +2366,8 @@ }, { "cell_type": "code", - "execution_count": 237, - "id": "8feb664b", + "execution_count": 95, + "id": "108ba2d2", "metadata": {}, "outputs": [ { @@ -2446,9 +2423,9 @@ " \n", " \n", " 4\n", - " 15A5(001)\n", - " 849820.30\n", - " 700737.01\n", + " 15A5(013)\n", + " 172633.00\n", + " 0.00\n", " \n", " \n", "\n", @@ -2460,10 +2437,10 @@ "1 0014(005) 879983.23 879983.23\n", "2 0027(012) 12830458.87 12830458.87\n", "3 0061(025) 2595722.00 2595722.00\n", - "4 15A5(001) 849820.30 700737.01" + "4 15A5(013) 172633.00 0.00" ] }, - "execution_count": 237, + "execution_count": 95, "metadata": {}, "output_type": "execute_result" } @@ -2474,17 +2451,17 @@ }, { "cell_type": "code", - "execution_count": 238, - "id": "694c3ea3", + "execution_count": 96, + "id": "499a97b5", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4305" + "4330" ] }, - "execution_count": 238, + "execution_count": 96, "metadata": {}, 
"output_type": "execute_result" } @@ -2495,8 +2472,8 @@ }, { "cell_type": "code", - "execution_count": 239, - "id": "def28835", + "execution_count": 97, + "id": "ad620b44", "metadata": {}, "outputs": [], "source": [ @@ -2508,8 +2485,8 @@ }, { "cell_type": "code", - "execution_count": 240, - "id": "e32b1394", + "execution_count": 98, + "id": "4b8ac706", "metadata": {}, "outputs": [], "source": [ @@ -2518,8 +2495,8 @@ }, { "cell_type": "code", - "execution_count": 241, - "id": "6647d17e", + "execution_count": 99, + "id": "5dc3d5e0", "metadata": {}, "outputs": [], "source": [ @@ -2528,18 +2505,18 @@ }, { "cell_type": "code", - "execution_count": 242, - "id": "5f806e1c", + "execution_count": 100, + "id": "e53fe275", "metadata": {}, "outputs": [], "source": [ - "fund_phase_df_pivot1 = to_snakecase(fund_phase_df_pivot1)" + "fund_phase_df_pivot1 = _utils.to_snakecase(fund_phase_df_pivot1)" ] }, { "cell_type": "code", - "execution_count": 243, - "id": "871c217b", + "execution_count": 101, + "id": "4491ab40", "metadata": {}, "outputs": [], "source": [ @@ -2549,8 +2526,8 @@ }, { "cell_type": "code", - "execution_count": 244, - "id": "52c83028", + "execution_count": 102, + "id": "0e29b5dc", "metadata": {}, "outputs": [ { @@ -2572,7 +2549,7 @@ " 'transportation_investment_fund']" ] }, - "execution_count": 244, + "execution_count": 102, "metadata": {}, "output_type": "execute_result" } @@ -2583,8 +2560,8 @@ }, { "cell_type": "code", - "execution_count": 245, - "id": "2139367b", + "execution_count": 103, + "id": "65359dc1", "metadata": {}, "outputs": [], "source": [ @@ -2594,8 +2571,8 @@ }, { "cell_type": "code", - "execution_count": 246, - "id": "590778fd", + "execution_count": 104, + "id": "e0568769", "metadata": {}, "outputs": [], "source": [ @@ -2605,8 +2582,8 @@ }, { "cell_type": "code", - "execution_count": 247, - "id": "2f7f8d5d", + "execution_count": 105, + "id": "5c6f2da7", "metadata": {}, "outputs": [], "source": [ @@ -2616,8 +2593,8 @@ }, { "cell_type": "code", - 
"execution_count": 248, - "id": "84f661b8", + "execution_count": 106, + "id": "efd1406e", "metadata": {}, "outputs": [], "source": [ @@ -2626,8 +2603,8 @@ }, { "cell_type": "code", - "execution_count": 249, - "id": "b01be19d", + "execution_count": 107, + "id": "de4dd1bf", "metadata": {}, "outputs": [], "source": [ @@ -2638,8 +2615,8 @@ }, { "cell_type": "code", - "execution_count": 250, - "id": "b92aa697", + "execution_count": 108, + "id": "03047890", "metadata": {}, "outputs": [], "source": [ @@ -2649,17 +2626,17 @@ }, { "cell_type": "code", - "execution_count": 251, - "id": "feccd5ab", + "execution_count": 109, + "id": "d81411fd", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(4305, 20)" + "(4330, 20)" ] }, - "execution_count": 251, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } @@ -2670,17 +2647,17 @@ }, { "cell_type": "code", - "execution_count": 252, - "id": "c6b364da", + "execution_count": 110, + "id": "e98e4b0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "4305" + "4330" ] }, - "execution_count": 252, + "execution_count": 110, "metadata": {}, "output_type": "execute_result" } @@ -2691,357 +2668,63 @@ }, { "cell_type": "code", - "execution_count": 253, - "id": "49267b9b", + "execution_count": 111, + "id": "417a220f", "metadata": {}, "outputs": [], "source": [ - "# Tag whether something is funded by state/federal/both\n", - "def is_state_funds(row):\n", - " if row.total_state_funds > 0:\n", - " return \"Yes\"\n", - " else:\n", - " return \"No\"" + "final_fund_phase_df[\"is_state\"] = final_fund_phase_df.apply(_utils.is_state_funds, axis=1)" ] }, { "cell_type": "code", - "execution_count": 254, - "id": "bede5e25", + "execution_count": 112, + "id": "04ad9e81", "metadata": {}, "outputs": [], "source": [ - "def is_fed_funds(row):\n", - " if row.total_federal_funds > 0:\n", - " return \"Yes\"\n", - " else:\n", - " return \"No\"" + "final_fund_phase_df[\"is_federal\"] = 
final_fund_phase_df.apply(_utils.is_fed_funds, axis=1)" ] }, { "cell_type": "code", - "execution_count": 255, - "id": "a08995e3", + "execution_count": 113, + "id": "e6911d49", "metadata": {}, "outputs": [], "source": [ - "final_fund_phase_df[\"is_state\"] = final_fund_phase_df.apply(is_state_funds, axis=1)" + "final_fund_phase_df = final_fund_phase_df.fillna(0)" ] }, { "cell_type": "code", - "execution_count": 256, - "id": "13f768cb", + "execution_count": 114, + "id": "85bdef30", "metadata": {}, "outputs": [], "source": [ - "final_fund_phase_df[\"is_federal\"] = final_fund_phase_df.apply(is_fed_funds, axis=1)" + "to_keep = ['project_id', 'single_phase_cost',\n", + " 'single_phase_expenditure_amt', 'total_state_funds','total_federal_funds', 'is_state',\n", + " 'is_federal']" ] }, { "cell_type": "code", - "execution_count": 257, - "id": "bd42f0c3", + "execution_count": 115, + "id": "9a936952", "metadata": {}, "outputs": [], "source": [ - "final_fund_phase_df = final_fund_phase_df.fillna(0)" - ] - }, - { - "cell_type": "markdown", - "id": "a8e168da", - "metadata": {}, - "source": [ - "### Double Checking\n", - "* Make sure the project flag is correct" - ] - }, - { - "cell_type": "code", - "execution_count": 258, - "id": "c7651df6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "6211(130) 32\n", - "5908(031) 28\n", - "6053(130) 27\n", - "6211(131) 27\n", - "5006(219) 23\n", - "Name: project_id, dtype: int64" - ] - }, - "execution_count": 258, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fund_phase_df.project_id.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": 259, - "id": "ca87e3a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_id corridor_mobility_improvement_account environmental_enhanc_&_mitigat_prgm_fd federal_trust_fund highway_safety,rehabilitation,&_preservation_acct local_bridge_seismic_retrofit_acct road_maintenance_&_rehabilitation_account,_stf state_highway_account state-local_partnership_program_acct trade_corridor_enhancement_account,_stf trade_corridors_improvement_fund traffic_congestion_relief_fund transportation_deferred_investment_fund transportation_faciilities_account transportation_investment_fund unknown total_state_funds single_phase_cost single_phase_expenditure_amt total_federal_funds is_state is_federal
32095944(068)0.0000000.0000005412383.3900000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.00000085000.0000000.00000085000.0000005497383.3900005497383.3900005412383.390000YesYes
" - ], - "text/plain": [ - "" - ] - }, - "execution_count": 259, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')" - ] - }, - { - "cell_type": "code", - "execution_count": 260, - "id": "f3b59b90", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
adv_project_idfund_codepec_codeappropriation_category_codecurr_bud_amcash_exp_ampect_task_codeproject_idgeneral_fund
38040500000588089020300100203630485.13630485.133005944(068)Federal Trust Fund
3805050000058830082030600050685000.0085000.006205944(068)Transportation Investment Fund
38060500000588089020300100910809514.72809514.723005944(068)Federal Trust Fund
380705000005880890203001012131001729.001001729.003005944(068)Federal Trust Fund
380805000005880890203001015162970654.542970654.543005944(068)Federal Trust Fund
\n", - "
" - ], - "text/plain": [ - " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "3804 0500000588 0890 2030010 0203 \n", - "3805 0500000588 3008 2030600 0506 \n", - "3806 0500000588 0890 2030010 0910 \n", - "3807 0500000588 0890 2030010 1213 \n", - "3808 0500000588 0890 2030010 1516 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id \\\n", - "3804 630485.13 630485.13 300 5944(068) \n", - "3805 85000.00 85000.00 620 5944(068) \n", - "3806 809514.72 809514.72 300 5944(068) \n", - "3807 1001729.00 1001729.00 300 5944(068) \n", - "3808 2970654.54 2970654.54 300 5944(068) \n", - "\n", - " general_fund \n", - "3804 Federal Trust Fund \n", - "3805 Transportation Investment Fund \n", - "3806 Federal Trust Fund \n", - "3807 Federal Trust Fund \n", - "3808 Federal Trust Fund " - ] - }, - "execution_count": 260, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']" - ] - }, - { - "cell_type": "code", - "execution_count": 261, - "id": "c6465aa8", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "curr_bud_am 5412383.39\n", - "dtype: float64" - ] - }, - "execution_count": 261, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 262, - "id": "1a4ae929", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "curr_bud_am 5497383.39\n", - "dtype: float64" - ] - }, - "execution_count": 262, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 263, - "id": "5dcaa987", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "3209 
5497383.39\n", - "dtype: float64" - ] - }, - "execution_count": 263, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund" + "final_fund_phase_df2 = final_fund_phase_df[to_keep]" ] }, { "cell_type": "code", - "execution_count": 264, - "id": "88142d1f", - "metadata": {}, + "execution_count": 116, + "id": "41b0c099", + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -3090,13 +2773,13 @@ " \n", " \n", " \n", - " 383\n", - " 5006(219)\n", + " 55\n", + " 18D3(041)\n", + " 0.00\n", + " 0.00\n", " 0.00\n", " 0.00\n", - " 32967253.86\n", " 0.00\n", - " 229400.00\n", " 0.00\n", " 0.00\n", " 0.00\n", @@ -3107,525 +2790,209 @@ " 0.00\n", " 0.00\n", " 0.00\n", - " 229400.00\n", - " 33196653.86\n", - " 32534546.43\n", - " 32967253.86\n", - " Yes\n", - " Yes\n", - " \n", - " \n", - "\n", - "" - ], - "text/plain": [ - " project_id corridor_mobility_improvement_account \\\n", - "383 5006(219) 0.00 \n", - "\n", - " environmental_enhanc_&_mitigat_prgm_fd federal_trust_fund \\\n", - "383 0.00 32967253.86 \n", - "\n", - " highway_safety,rehabilitation,&_preservation_acct \\\n", - "383 0.00 \n", - "\n", - " local_bridge_seismic_retrofit_acct \\\n", - "383 229400.00 \n", - "\n", - " road_maintenance_&_rehabilitation_account,_stf state_highway_account \\\n", - "383 0.00 0.00 \n", - "\n", - " state-local_partnership_program_acct \\\n", - "383 0.00 \n", - "\n", - " trade_corridor_enhancement_account,_stf \\\n", - "383 0.00 \n", - "\n", - " trade_corridors_improvement_fund traffic_congestion_relief_fund \\\n", - "383 0.00 0.00 \n", - "\n", - " transportation_deferred_investment_fund \\\n", - "383 0.00 \n", - "\n", - " transportation_faciilities_account transportation_investment_fund \\\n", - "383 0.00 0.00 \n", - "\n", - " unknown total_state_funds single_phase_cost 
\\\n", - "383 0.00 229400.00 33196653.86 \n", - "\n", - " single_phase_expenditure_amt total_federal_funds is_state is_federal \n", - "383 32534546.43 32967253.86 Yes Yes " - ] - }, - "execution_count": 264, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']" - ] - }, - { - "cell_type": "code", - "execution_count": 265, - "id": "61f4652c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "curr_bud_am 32967253.86\n", - "dtype: float64" - ] - }, - "execution_count": 265, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 266, - "id": "72e610b6", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "33196653.86" - ] - }, - "execution_count": 266, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "229400.00 + 32967253.86" - ] - }, - { - "cell_type": "code", - "execution_count": 267, - "id": "9c4e34c0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
adv_project_idfund_codepec_codeappropriation_category_codecurr_bud_amcash_exp_ampect_task_codeproject_idgeneral_fund
573407000011580890203001005061000000.001000000.008105006(219)Federal Trust Fund0.000.000.000.00NoNo
57350700001158089020300101415572006(034)0.000.003005006(219)Federal Trust Fund
573607000011580890203001008091691542.001691542.008105006(219)Federal Trust Fund
5737070000115808902030010101125448.4225448.428105006(219)Federal Trust Fund
5738070000115808902030010111220206009.5420206009.543005006(219)Federal Trust Fund
573907000011580890203001013143216979.123216979.123005006(219)Federal Trust Fund
57400700001158089020300101415154672.27154672.273005006(219)Federal Trust Fund
57410700001158089020300101617608787.00294068.823005006(219)Federal Trust Fund
574207000011580890203001019201876299.001653630.553005006(219)Federal Trust Fund
57430700001158089020300101819106000.0094331.533005006(219)Federal Trust Fund
57440700001158089020300101819626000.00512947.673005006(219)Federal Trust Fund
5745070000115808902030010101120985.5220985.523005006(219)Federal Trust Fund
5746070000115808902030010101180000.0080000.003005006(219)Federal Trust Fund
57470700001158089020300101011210932.42210932.423005006(219)Federal Trust Fund
574807000011580890203001010111338648.661338648.663005006(219)Federal Trust Fund
5749070000115808902030010101119384.5719384.573005006(219)Federal Trust Fund
5750070000115808902030010141539345.3639345.363005006(219)Federal Trust Fund
57510700001158089020300101112516666.98516666.988105006(219)Federal Trust Fund
575207000011580890203001013140.000.008105006(219)Federal Trust Fund
575307000011580890203001011120.000.003005006(219)Federal Trust Fund0.000.000.000.000.000.000.000.000.000.000.000.000.00NoNo
57540700001158089020300101213392119.00392119.003005006(219)Federal Trust Fund582006(048)0.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00NoNo
57550700001158089020300101415837434.00837434.003005006(219)Federal Trust Fund592006(049)0.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00NoNo
57560700001158606220300101112229400.00229400.006905006(219)Local Bridge Seismic Retrofit Acct602006(053)0.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.000.00NoNo
\n", "
" ], "text/plain": [ - " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "5734 0700001158 0890 2030010 0506 \n", - "5735 0700001158 0890 2030010 1415 \n", - "5736 0700001158 0890 2030010 0809 \n", - "5737 0700001158 0890 2030010 1011 \n", - "5738 0700001158 0890 2030010 1112 \n", - "5739 0700001158 0890 2030010 1314 \n", - "5740 0700001158 0890 2030010 1415 \n", - "5741 0700001158 0890 2030010 1617 \n", - "5742 0700001158 0890 2030010 1920 \n", - "5743 0700001158 0890 2030010 1819 \n", - "5744 0700001158 0890 2030010 1819 \n", - "5745 0700001158 0890 2030010 1011 \n", - "5746 0700001158 0890 2030010 1011 \n", - "5747 0700001158 0890 2030010 1011 \n", - "5748 0700001158 0890 2030010 1011 \n", - "5749 0700001158 0890 2030010 1011 \n", - "5750 0700001158 0890 2030010 1415 \n", - "5751 0700001158 0890 2030010 1112 \n", - "5752 0700001158 0890 2030010 1314 \n", - "5753 0700001158 0890 2030010 1112 \n", - "5754 0700001158 0890 2030010 1213 \n", - "5755 0700001158 0890 2030010 1415 \n", - "5756 0700001158 6062 2030010 1112 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id \\\n", - "5734 1000000.00 1000000.00 810 5006(219) \n", - "5735 0.00 0.00 300 5006(219) \n", - "5736 1691542.00 1691542.00 810 5006(219) \n", - "5737 25448.42 25448.42 810 5006(219) \n", - "5738 20206009.54 20206009.54 300 5006(219) \n", - "5739 3216979.12 3216979.12 300 5006(219) \n", - "5740 154672.27 154672.27 300 5006(219) \n", - "5741 608787.00 294068.82 300 5006(219) \n", - "5742 1876299.00 1653630.55 300 5006(219) \n", - "5743 106000.00 94331.53 300 5006(219) \n", - "5744 626000.00 512947.67 300 5006(219) \n", - "5745 20985.52 20985.52 300 5006(219) \n", - "5746 80000.00 80000.00 300 5006(219) \n", - "5747 210932.42 210932.42 300 5006(219) \n", - "5748 1338648.66 1338648.66 300 5006(219) \n", - "5749 19384.57 19384.57 300 5006(219) \n", - "5750 39345.36 39345.36 300 5006(219) \n", - "5751 516666.98 516666.98 810 5006(219) \n", - "5752 0.00 0.00 810 5006(219) 
\n", - "5753 0.00 0.00 300 5006(219) \n", - "5754 392119.00 392119.00 300 5006(219) \n", - "5755 837434.00 837434.00 300 5006(219) \n", - "5756 229400.00 229400.00 690 5006(219) \n", - "\n", - " general_fund \n", - "5734 Federal Trust Fund \n", - "5735 Federal Trust Fund \n", - "5736 Federal Trust Fund \n", - "5737 Federal Trust Fund \n", - "5738 Federal Trust Fund \n", - "5739 Federal Trust Fund \n", - "5740 Federal Trust Fund \n", - "5741 Federal Trust Fund \n", - "5742 Federal Trust Fund \n", - "5743 Federal Trust Fund \n", - "5744 Federal Trust Fund \n", - "5745 Federal Trust Fund \n", - "5746 Federal Trust Fund \n", - "5747 Federal Trust Fund \n", - "5748 Federal Trust Fund \n", - "5749 Federal Trust Fund \n", - "5750 Federal Trust Fund \n", - "5751 Federal Trust Fund \n", - "5752 Federal Trust Fund \n", - "5753 Federal Trust Fund \n", - "5754 Federal Trust Fund \n", - "5755 Federal Trust Fund \n", - "5756 Local Bridge Seismic Retrofit Acct " + " project_id corridor_mobility_improvement_account \\\n", + "55 18D3(041) 0.00 \n", + "57 2006(034) 0.00 \n", + "58 2006(048) 0.00 \n", + "59 2006(049) 0.00 \n", + "60 2006(053) 0.00 \n", + "\n", + " environmental_enhanc_&_mitigat_prgm_fd federal_trust_fund \\\n", + "55 0.00 0.00 \n", + "57 0.00 0.00 \n", + "58 0.00 0.00 \n", + "59 0.00 0.00 \n", + "60 0.00 0.00 \n", + "\n", + " highway_safety,rehabilitation,&_preservation_acct \\\n", + "55 0.00 \n", + "57 0.00 \n", + "58 0.00 \n", + "59 0.00 \n", + "60 0.00 \n", + "\n", + " local_bridge_seismic_retrofit_acct \\\n", + "55 0.00 \n", + "57 0.00 \n", + "58 0.00 \n", + "59 0.00 \n", + "60 0.00 \n", + "\n", + " road_maintenance_&_rehabilitation_account,_stf state_highway_account \\\n", + "55 0.00 0.00 \n", + "57 0.00 0.00 \n", + "58 0.00 0.00 \n", + "59 0.00 0.00 \n", + "60 0.00 0.00 \n", + "\n", + " state-local_partnership_program_acct \\\n", + "55 0.00 \n", + "57 0.00 \n", + "58 0.00 \n", + "59 0.00 \n", + "60 0.00 \n", + "\n", + " trade_corridor_enhancement_account,_stf 
trade_corridors_improvement_fund \\\n", + "55 0.00 0.00 \n", + "57 0.00 0.00 \n", + "58 0.00 0.00 \n", + "59 0.00 0.00 \n", + "60 0.00 0.00 \n", + "\n", + " traffic_congestion_relief_fund transportation_deferred_investment_fund \\\n", + "55 0.00 0.00 \n", + "57 0.00 0.00 \n", + "58 0.00 0.00 \n", + "59 0.00 0.00 \n", + "60 0.00 0.00 \n", + "\n", + " transportation_faciilities_account transportation_investment_fund \\\n", + "55 0.00 0.00 \n", + "57 0.00 0.00 \n", + "58 0.00 0.00 \n", + "59 0.00 0.00 \n", + "60 0.00 0.00 \n", + "\n", + " unknown total_state_funds single_phase_cost \\\n", + "55 0.00 0.00 0.00 \n", + "57 0.00 0.00 0.00 \n", + "58 0.00 0.00 0.00 \n", + "59 0.00 0.00 0.00 \n", + "60 0.00 0.00 0.00 \n", + "\n", + " single_phase_expenditure_amt total_federal_funds is_state is_federal \n", + "55 0.00 0.00 No No \n", + "57 0.00 0.00 No No \n", + "58 0.00 0.00 No No \n", + "59 0.00 0.00 No No \n", + "60 0.00 0.00 No No " ] }, - "execution_count": 267, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "fund_phase_df.loc[fund_phase_df.project_id == '5006(219)']" - ] - }, - { - "cell_type": "markdown", - "id": "7f0ddf53", - "metadata": {}, - "source": [ - "## Awards Table\n", - "* Appropriation code is the fiscal year of award\n" + "final_fund_phase_df.loc[(final_fund_phase_df.is_state == \"No\") & (final_fund_phase_df.is_federal == \"No\")].head()" ] }, { "cell_type": "code", - "execution_count": 268, - "id": "f6e5a5a7", + "execution_count": 117, + "id": "2f222fad", "metadata": {}, "outputs": [ { @@ -3649,185 +3016,189 @@ " \n", " \n", " \n", - " adv_project_id\n", - " fund_code\n", - " pec_code\n", - " appropriation_category_code\n", - " curr_bud_am\n", - " cash_exp_am\n", - " pect_task_code\n", + " \n", " project_id\n", - " pec\n", - " pect\n", - " pect_description\n", - " program\n", + " \n", + " \n", + " is_state\n", + " is_federal\n", + " \n", " \n", " \n", " \n", " \n", - " 858\n", - " 0214000121\n", - " 0890\n", 
- " 2030010\n", - " 1617\n", - " 762938.00\n", - " 435821.60\n", - " 560\n", - " 5905(099)\n", - " 2030010\n", - " 560.00\n", - " High Risk Rural Roads Program (HR3)\n", - " Local Assistance\n", + " No\n", + " No\n", + " 448\n", + " \n", + " \n", + " Yes\n", + " 2500\n", + " \n", + " \n", + " Yes\n", + " No\n", + " 1247\n", + " \n", + " \n", + " Yes\n", + " 135\n", " \n", " \n", "\n", "" ], "text/plain": [ - " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "858 0214000121 0890 2030010 1617 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id pec pect \\\n", - "858 762938.00 435821.60 560 5905(099) 2030010 560.00 \n", - "\n", - " pect_description program \n", - "858 High Risk Rural Roads Program (HR3) Local Assistance " + " project_id\n", + "is_state is_federal \n", + "No No 448\n", + " Yes 2500\n", + "Yes No 1247\n", + " Yes 135" ] }, - "execution_count": 268, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "pect_df.sample()" + "final_fund_phase_df.groupby(['is_state', 'is_federal']).agg({'project_id':'nunique'})" + ] + }, + { + "cell_type": "markdown", + "id": "31cc5696", + "metadata": {}, + "source": [ + "### Double Checking\n", + "* Make sure the project flag is correct" ] }, { "cell_type": "code", - "execution_count": 269, - "id": "51f10795", + "execution_count": 118, + "id": "e71ef88d", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
adv_project_idfund_codepec_codeappropriation_category_codecurr_bud_amcash_exp_ampect_task_codeproject_id
59280700020294089020300101112554663.00554663.006905953(650)
\n", - "
" - ], - "text/plain": [ - " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "5928 0700020294 0890 2030010 1112 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id \n", - "5928 554663.00 554663.00 690 5953(650) " - ] - }, - "execution_count": 269, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "accounting_df.sample()" + "# fund_phase_df.project_id.value_counts().head()" ] }, { "cell_type": "code", - "execution_count": 270, - "id": "e33565a3", + "execution_count": 119, + "id": "87447a97", "metadata": {}, "outputs": [], "source": [ - "# Only want the most recent year of a pec_code listed once\n", - "awards_df = (pect_df\n", - " .groupby(['project_id', 'program'])\n", - " .agg({'appropriation_category_code':'max'})\n", - " .reset_index()\n", - " .rename(columns = {'appropriation_category_code':'state_fiscal_awarded_year',\n", - " 'program':'grant_program'})\n", - " )" + "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].style.where(lambda val: 'Yes' in str(val), 'color: red')" ] }, { - "cell_type": "markdown", - "id": "edbc2079", + "cell_type": "code", + "execution_count": 120, + "id": "ff83f49b", "metadata": {}, + "outputs": [], "source": [ - "## Checks" + "# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)']" ] }, { "cell_type": "code", - "execution_count": 271, - "id": "bd775031", + "execution_count": 121, + "id": "f8a7cc78", + "metadata": {}, + "outputs": [], + "source": [ + "# fund_phase_df.loc[(fund_phase_df.project_id == '5944(068)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "f8d7e6c6", + "metadata": {}, + "outputs": [], + "source": [ + "# fund_phase_df.loc[fund_phase_df.project_id == '5944(068)'][['curr_bud_am']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "ef5bdbcc", + "metadata": {}, + "outputs": [], + 
"source": [ + "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].total_state_funds + final_fund_phase_df.loc[final_fund_phase_df.project_id == '5944(068)'].federal_trust_fund" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "id": "bbce5cbd", + "metadata": {}, + "outputs": [], + "source": [ + "# final_fund_phase_df.loc[final_fund_phase_df.project_id == '5006(219)']" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "id": "f61fbd0b", + "metadata": {}, + "outputs": [], + "source": [ + "# fund_phase_df.loc[(fund_phase_df.project_id == '5006(219)')&(fund_phase_df.general_fund == 'Federal Trust Fund')][['curr_bud_am']].sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "9eb9c539", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "5182(058) 3\n", - "5288(046) 3\n", - "5475(038) 3\n", - "6066(140) 3\n", - "6090(059) 3\n", - "Name: project_id, dtype: int64" + "33196653.86" ] }, - "execution_count": 271, + "execution_count": 126, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "awards_df.project_id.value_counts().head()" + "229400.00 + 32967253.86" + ] + }, + { + "cell_type": "code", + "execution_count": 127, + "id": "a12dbb0c", + "metadata": {}, + "outputs": [], + "source": [ + "# fund_phase_df.loc[fund_phase_df.project_id == '5006(219)']" + ] + }, + { + "cell_type": "markdown", + "id": "46c290bd", + "metadata": {}, + "source": [ + "## Awards Table\n", + "* Appropriation code is the fiscal year of award\n" ] }, { "cell_type": "code", - "execution_count": 272, - "id": "d0b1444e", + "execution_count": 128, + "id": "8d2670d1", "metadata": {}, "outputs": [ { @@ -3851,64 +3222,67 @@ " \n", " \n", " \n", + " adv_project_id\n", + " fund_code\n", + " pec_code\n", + " appropriation_category_code\n", + " curr_bud_am\n", + " cash_exp_am\n", + " pect_task_code\n", " project_id\n", - " grant_program\n", - " state_fiscal_awarded_year\n", + " pec\n", + " pect\n", + " 
pect_description\n", + " program\n", " \n", " \n", " \n", " \n", - " 1546\n", - " 5182(058)\n", - " Active Transportation Program (ATP)\n", - " 2223\n", - " \n", - " \n", - " 1547\n", - " 5182(058)\n", - " Local Assistance\n", - " 2223\n", - " \n", - " \n", - " 1548\n", - " 5182(058)\n", - " Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017\n", - " 2122\n", + " 8504\n", + " 1013000080\n", + " 0042\n", + " 2030600\n", + " 1213\n", + " 25000.00\n", + " 25000.00\n", + " 621\n", + " 5940(103)\n", + " 2030600\n", + " 621.00\n", + " Local Roads Rehabilitation\n", + " State Transportation Improvement Program (STIP)\n", " \n", " \n", "\n", "" ], "text/plain": [ - " project_id \\\n", - "1546 5182(058) \n", - "1547 5182(058) \n", - "1548 5182(058) \n", + " adv_project_id fund_code pec_code appropriation_category_code \\\n", + "8504 1013000080 0042 2030600 1213 \n", "\n", - " grant_program \\\n", - "1546 Active Transportation Program (ATP) \n", - "1547 Local Assistance \n", - "1548 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", + " curr_bud_am cash_exp_am pect_task_code project_id pec pect \\\n", + "8504 25000.00 25000.00 621 5940(103) 2030600 621.00 \n", "\n", - " state_fiscal_awarded_year \n", - "1546 2223 \n", - "1547 2223 \n", - "1548 2122 " + " pect_description \\\n", + "8504 Local Roads Rehabilitation \n", + "\n", + " program \n", + "8504 State Transportation Improvement Program (STIP) " ] }, - "execution_count": 272, + "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "awards_df.loc[awards_df.project_id == \"5182(058)\"]" + "pect_df.sample()" ] }, { "cell_type": "code", - "execution_count": 273, - "id": "c7feef8d", + "execution_count": 129, + "id": "d48c1f0f", "metadata": {}, "outputs": [ { @@ -3940,132 +3314,19 @@ " 
cash_exp_am\n", " pect_task_code\n", " project_id\n", - " pec\n", - " pect\n", - " pect_description\n", - " program\n", " \n", " \n", " \n", " \n", - " 1412\n", - " 0312000145\n", - " 0890\n", - " 2030720\n", - " 2223\n", - " 4318000.00\n", - " 0.00\n", - " 100\n", - " 5182(058)\n", - " 2030720\n", - " 100.00\n", - " Active Transportation Program (ATP)\n", - " Active Transportation Program (ATP)\n", - " \n", - " \n", - " 1413\n", - " 0312000145\n", - " 0042\n", - " 2030210\n", - " 2122\n", - " 6239000.00\n", - " 0.00\n", - " 350\n", - " 5182(058)\n", - " 2030210\n", - " 350.00\n", - " Solutions for Congested Corridors Program (SCCP)\n", - " Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017\n", - " \n", - " \n", - " 1414\n", - " 0312000145\n", - " 0890\n", - " 2030010\n", - " 1011\n", - " 456704.00\n", - " 456704.00\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1415\n", - " 0312000145\n", - " 0890\n", - " 2030010\n", - " 1112\n", - " 0.00\n", - " 0.00\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1416\n", - " 0312000145\n", - " 0890\n", - " 2030010\n", - " 1213\n", - " 54423.24\n", - " 54423.24\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1417\n", - " 0312000145\n", - " 0890\n", - " 2030010\n", - " 1920\n", - " 50000.00\n", - " 13000.00\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1418\n", - " 0312000145\n", + " 6581\n", + " 
0718000255\n", " 0890\n", " 2030010\n", - " 2223\n", - " 333821.00\n", + " 2122\n", + " 1238310.00\n", " 0.00\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1419\n", - " 0312000145\n", - " 0890\n", - " 2030010\n", - " 1516\n", - " 34991.76\n", - " 34991.76\n", - " 820\n", - " 5182(058)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", + " 550\n", + " 5257(037)\n", " \n", " \n", "\n", @@ -4073,159 +3334,76 @@ ], "text/plain": [ " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "1412 0312000145 0890 2030720 2223 \n", - "1413 0312000145 0042 2030210 2122 \n", - "1414 0312000145 0890 2030010 1011 \n", - "1415 0312000145 0890 2030010 1112 \n", - "1416 0312000145 0890 2030010 1213 \n", - "1417 0312000145 0890 2030010 1920 \n", - "1418 0312000145 0890 2030010 2223 \n", - "1419 0312000145 0890 2030010 1516 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id pec pect \\\n", - "1412 4318000.00 0.00 100 5182(058) 2030720 100.00 \n", - "1413 6239000.00 0.00 350 5182(058) 2030210 350.00 \n", - "1414 456704.00 456704.00 820 5182(058) 2030010 820.00 \n", - "1415 0.00 0.00 820 5182(058) 2030010 820.00 \n", - "1416 54423.24 54423.24 820 5182(058) 2030010 820.00 \n", - "1417 50000.00 13000.00 820 5182(058) 2030010 820.00 \n", - "1418 333821.00 0.00 820 5182(058) 2030010 820.00 \n", - "1419 34991.76 34991.76 820 5182(058) 2030010 820.00 \n", - "\n", - " pect_description \\\n", - "1412 Active Transportation Program (ATP) \n", - "1413 Solutions for Congested Corridors Program (SCCP) \n", - "1414 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1415 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1416 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1417 Congestion 
Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1418 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1419 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", + "6581 0718000255 0890 2030010 2122 \n", "\n", - " program \n", - "1412 Active Transportation Program (ATP) \n", - "1413 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", - "1414 Local Assistance \n", - "1415 Local Assistance \n", - "1416 Local Assistance \n", - "1417 Local Assistance \n", - "1418 Local Assistance \n", - "1419 Local Assistance " + " curr_bud_am cash_exp_am pect_task_code project_id \n", + "6581 1238310.00 0.00 550 5257(037) " ] }, - "execution_count": 273, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# Check original df \n", - "pect_df.loc[pect_df.project_id == \"5182(058)\"]" + "accounting_df.sample()" ] }, { "cell_type": "code", - "execution_count": 274, - "id": "0bbac42d", + "execution_count": 130, + "id": "2c9cca16", + "metadata": {}, + "outputs": [], + "source": [ + "# Only want the most recent year of a pec_code listed once\n", + "awards_df = (pect_df\n", + " .groupby(['project_id', 'program'])\n", + " .agg({'appropriation_category_code':'max'})\n", + " .reset_index()\n", + " .rename(columns = {'appropriation_category_code':'state_fiscal_awarded_year',\n", + " 'program':'grant_program'})\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "3de7091b", + "metadata": {}, + "source": [ + "## Checks" + ] + }, + { + "cell_type": "code", + "execution_count": 131, + "id": "b7c63b95", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_id comment_desc district_code est_total_prj_costs location_name project_label_name original_post_mile_begin_id original_post_mile_end_id revised_post_mile_begin_ind revised_post_mile_end_ind route_name state_hwy_ind senate_district_code update_date_time agency_name urban_area_code county_name work_type_desc category_desc current_phase active_transportation_program_(atp) bridge_inspection_&_scour_evaluation covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation carbon_reduction_program_(crp) congestion_mitigation_&_air_quality_improvement_program_(cmaq) coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds corridor_mobility_improvement_account_(cmia)_program county_exchange_funds county_state_match_program earmarks_projects_(hpp,_demo_cpfcds,_etc.) emergency_relief_(er) ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program funds_for_planning,_programming_and_monitoring_-_rip general_funded_designated_programs hazard_elimination_safety_(hes) high_risk_rural_roads_program_(hr3) highway_bridge_ highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund highway_safety_improvement_program_(hsip)_(non-infrastructure) highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund local_partnership_program_(lpp_–_competitive)_ local_roads local_roads_rehabilitation railroad_grade_crossing_protection railroad_grade_separations rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_ regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system) regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip) regional_transportation_planning_agency_(rtpa)_stp_match_exchange sb1_funded_freeway_service_patrol shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds 
safe_routes_to_school_(sr2s_and_srts) set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act solutions_for_congested_corridors_program_(sccp) special_programs state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic) structures_seismic_retrofit_ trade_corridor_enhancement_account_(tcea)_programs_–_local_share trade_corridor_enhancement_account_(tcea)_programs_–_state_share trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads traffic_congestion_relief_program_(_tcrp_) unknown
78345288(046)Data Migrated from CTIPS :\r\n", - "The Project Planning Id are: 1785; \r\n", - "The locations are :In Folsom on White Rock Road in the vicinity of the Scott Road Intersection. Widen 1 mile of 4-lane roadway and signalize 1 Intersection.;\n", - "2/13/2020: This project is the same as STPL-6498(003). Agency is determining whether CMGC negotiations will be viable via the JPA and if not, project will be turned over to City of Folsom to implement/construct. CR\n", - "2/13/2020: This project is the same as \n", - "Project has $10,000 LPP and $15,000 RIP/STIP.\n", - "8/22/22: Cost adj to correct local funds to local AC $6,201,500. Erroneously entered as local funds in prior sequence.0325750000.000000In City of Folsom, on White Rock Road from Prairie City Road to East Bidwell Street.Construct 4 lane road with 8 foot shouldersnannanNoneNone0-FOLNNone2023-10-27 10:33:06Folsom3067Sacramento CountyNoneRoadway Wideningsingle phaseNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoYesYesNoNoNoNoNoYesNoNoNoNoNoNoNoNoNoNoNoNoNoNo
" - ], "text/plain": [ - "" + "6066(140) 3\n", + "5938(233) 3\n", + "5956(221) 3\n", + "5182(058) 3\n", + "5475(038) 3\n", + "Name: project_id, dtype: int64" ] }, - "execution_count": 274, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "project_df[project_df.project_id == \"5288(046)\"].style.where(lambda val: 'Yes' in str(val), 'color: red')" + "awards_df.project_id.value_counts().head()" ] }, { "cell_type": "code", - "execution_count": 275, - "id": "baf7d57e", + "execution_count": 132, + "id": "da2a3dcc", "metadata": {}, "outputs": [ { @@ -4256,20 +3434,20 @@ " \n", " \n", " \n", - " 2416\n", - " 5475(038)\n", + " 1550\n", + " 5182(058)\n", " Active Transportation Program (ATP)\n", " 2223\n", " \n", " \n", - " 2417\n", - " 5475(038)\n", + " 1551\n", + " 5182(058)\n", " Local Assistance\n", " 2223\n", " \n", " \n", - " 2418\n", - " 5475(038)\n", + " 1552\n", + " 5182(058)\n", " Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017\n", " 2122\n", " \n", @@ -4279,34 +3457,55 @@ ], "text/plain": [ " project_id \\\n", - "2416 5475(038) \n", - "2417 5475(038) \n", - "2418 5475(038) \n", + "1550 5182(058) \n", + "1551 5182(058) \n", + "1552 5182(058) \n", "\n", " grant_program \\\n", - "2416 Active Transportation Program (ATP) \n", - "2417 Local Assistance \n", - "2418 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", + "1550 Active Transportation Program (ATP) \n", + "1551 Local Assistance \n", + "1552 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", "\n", " state_fiscal_awarded_year \n", - "2416 2223 \n", - "2417 2223 \n", - "2418 2122 " + "1550 2223 \n", + "1551 2223 \n", + "1552 2122 " ] }, - 
"execution_count": 275, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "awards_df.loc[awards_df.project_id == \"5475(038)\"]" + "awards_df.loc[awards_df.project_id == \"5182(058)\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 133, + "id": "d6d426ce", + "metadata": {}, + "outputs": [], + "source": [ + "# Check original df \n", + "# pect_df.loc[pect_df.project_id == \"5182(058)\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 134, + "id": "887d4afd", + "metadata": {}, + "outputs": [], + "source": [ + "# project_df[project_df.project_id == \"5288(046)\"].style.where(lambda val: 'Yes' in str(val), 'color: red')" ] }, { "cell_type": "code", - "execution_count": 276, - "id": "b089062a", + "execution_count": 135, + "id": "8bb2e36c", "metadata": {}, "outputs": [ { @@ -4330,229 +3529,160 @@ " \n", " \n", " \n", - " adv_project_id\n", - " fund_code\n", - " pec_code\n", - " appropriation_category_code\n", - " curr_bud_am\n", - " cash_exp_am\n", - " pect_task_code\n", " project_id\n", - " pec\n", - " pect\n", - " pect_description\n", - " program\n", + " grant_program\n", + " state_fiscal_awarded_year\n", " \n", " \n", " \n", " \n", - " 1639\n", - " 0315000005\n", - " 0890\n", - " 2030720\n", - " 2223\n", - " 1512000.00\n", - " 0.00\n", - " 100\n", + " 2442\n", " 5475(038)\n", - " 2030720\n", - " 100.00\n", - " Active Transportation Program (ATP)\n", " Active Transportation Program (ATP)\n", + " 2223\n", " \n", " \n", - " 1640\n", - " 0315000005\n", - " 0042\n", - " 2030210\n", - " 2122\n", - " 2860000.00\n", - " 0.00\n", - " 350\n", - " 5475(038)\n", - " 2030210\n", - " 350.00\n", - " Solutions for Congested Corridors Program (SCCP)\n", - " Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017\n", - " \n", - " \n", - " 1641\n", - " 0315000005\n", - " 0890\n", - " 2030010\n", - " 1314\n", - " 1061999.97\n", - 
" 1061999.97\n", - " 820\n", - " 5475(038)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", - " Local Assistance\n", - " \n", - " \n", - " 1642\n", - " 0315000005\n", - " 0890\n", - " 2030010\n", - " 1516\n", - " 2898000.00\n", - " 2898000.00\n", - " 820\n", + " 2443\n", " 5475(038)\n", - " 2030010\n", - " 820.00\n", - " Congestion Mitigation & Air Quality Improvement Program (CMAQ)\n", " Local Assistance\n", + " 2223\n", " \n", " \n", - " 1643\n", - " 0315000005\n", - " 0890\n", - " 2030010\n", - " 2223\n", - " 9552155.00\n", - " 0.00\n", - " 810\n", + " 2444\n", " 5475(038)\n", - " 2030010\n", - " 810.00\n", - " Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP)\n", - " Local Assistance\n", + " Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017\n", + " 2122\n", " \n", " \n", "\n", "" ], "text/plain": [ - " adv_project_id fund_code pec_code appropriation_category_code \\\n", - "1639 0315000005 0890 2030720 2223 \n", - "1640 0315000005 0042 2030210 2122 \n", - "1641 0315000005 0890 2030010 1314 \n", - "1642 0315000005 0890 2030010 1516 \n", - "1643 0315000005 0890 2030010 2223 \n", - "\n", - " curr_bud_am cash_exp_am pect_task_code project_id pec pect \\\n", - "1639 1512000.00 0.00 100 5475(038) 2030720 100.00 \n", - "1640 2860000.00 0.00 350 5475(038) 2030210 350.00 \n", - "1641 1061999.97 1061999.97 820 5475(038) 2030010 820.00 \n", - "1642 2898000.00 2898000.00 820 5475(038) 2030010 820.00 \n", - "1643 9552155.00 0.00 810 5475(038) 2030010 810.00 \n", + " project_id \\\n", + "2442 5475(038) \n", + "2443 5475(038) \n", + "2444 5475(038) \n", "\n", - " pect_description \\\n", - "1639 Active Transportation Program (ATP) \n", - "1640 Solutions for Congested Corridors Program (SCCP) \n", - "1641 Congestion Mitigation & Air Quality Improvement Program (CMAQ) 
\n", - "1642 Congestion Mitigation & Air Quality Improvement Program (CMAQ) \n", - "1643 Regional Surface Transportation Block Grant Program (RSTBGP) and Highway Infrastructure Program (HIP) \n", + " grant_program \\\n", + "2442 Active Transportation Program (ATP) \n", + "2443 Local Assistance \n", + "2444 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", "\n", - " program \n", - "1639 Active Transportation Program (ATP) \n", - "1640 Proposition 1B, Hwy Safety, Traffic Reduction, Air Quality , and Port Security Bond Act of 2006, and SB 1: The Road Repair and Accountability Act of 2017 \n", - "1641 Local Assistance \n", - "1642 Local Assistance \n", - "1643 Local Assistance " + " state_fiscal_awarded_year \n", + "2442 2223 \n", + "2443 2223 \n", + "2444 2122 " ] }, - "execution_count": 276, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } ], + "source": [ + "awards_df.loc[awards_df.project_id == \"5475(038)\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "id": "42649961", + "metadata": {}, + "outputs": [], "source": [ "# Check original df \n", - "pect_df.loc[pect_df.project_id == \"5475(038)\"]" + "# pect_df.loc[pect_df.project_id == \"5475(038)\"]" ] }, { "cell_type": "code", - "execution_count": 277, - "id": "e6406250", + "execution_count": 137, + "id": "ebc41d72", "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", + " }
project_id comment_desc district_code est_total_prj_costs location_name project_label_name original_post_mile_begin_id original_post_mile_end_id revised_post_mile_begin_ind revised_post_mile_end_ind route_name state_hwy_ind senate_district_code update_date_time agency_name urban_area_code county_name work_type_desc category_desc current_phase active_transportation_program_(atp) bridge_inspection_&_scour_evaluation covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation carbon_reduction_program_(crp) congestion_mitigation_&_air_quality_improvement_program_(cmaq) coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds corridor_mobility_improvement_account_(cmia)_program county_exchange_funds county_state_match_program earmarks_projects_(hpp,_demo_cpfcds,_etc.) emergency_relief_(er) ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program funds_for_planning,_programming_and_monitoring_-_rip general_funded_designated_programs hazard_elimination_safety_(hes) high_risk_rural_roads_program_(hr3) highway_bridge_ highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund highway_safety_improvement_program_(hsip)_(non-infrastructure) highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund local_partnership_program_(lpp_–_competitive)_ local_roads local_roads_rehabilitation railroad_grade_crossing_protection railroad_grade_separations rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_ regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system) regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip) regional_transportation_planning_agency_(rtpa)_stp_match_exchange sb1_funded_freeway_service_patrol shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds 
safe_routes_to_school_(sr2s_and_srts) set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act solutions_for_congested_corridors_program_(sccp) special_programs state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic) structures_seismic_retrofit_ trade_corridor_enhancement_account_(tcea)_programs_–_local_share trade_corridor_enhancement_account_(tcea)_programs_–_state_share trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads traffic_congestion_relief_program_(_tcrp_) unknown
\n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", "
project_id comment_desc district_code est_total_prj_costs location_name project_label_name original_post_mile_begin_id original_post_mile_end_id revised_post_mile_begin_ind revised_post_mile_end_ind route_name state_hwy_ind senate_district_code update_date_time agency_name urban_area_code county_name work_type_desc category_desc current_phase active_transportation_program_(atp) bridge_inspection_&_scour_evaluation covid_relief_funds_for_highway_infrastructure_programs_for_stip-covid_augmentation carbon_reduction_program_(crp) congestion_mitigation_&_air_quality_improvement_program_(cmaq) coronavirus_response_and_relief_supplemental_appropriations_act_(crrsaa)_funds corridor_mobility_improvement_account_(cmia)_program county_exchange_funds county_state_match_program earmarks_projects_(hpp,_demo_cpfcds,_etc.) emergency_relief_(er) ferry_boat_program_(fbp)_and_ferry_boat_discretionary_(fbd)_program funds_for_planning,_programming_and_monitoring_-_rip general_funded_designated_programs hazard_elimination_safety_(hes) high_risk_rural_roads_program_(hr3) highway_bridge_ highway_safety_improvement_program_(hsip)_(infrastructure)-state_fund highway_safety_improvement_program_(hsip)_(non-infrastructure) highway_safety_improvement_program_(hsip)(infrastructure)-federal_fund local_partnership_program_(lpp_–_competitive)_ local_roads local_roads_rehabilitation railroad_grade_crossing_protection railroad_grade_separations rebuilding_american_infrastructure_with_sustainability_and_equity_(raise)_and_multimodal_project_discretionary_grant_programs_(e.g.,_infra,_mega,_rstg_or_rural)_ regional_improvement_program_–_regional_share_of_stip_transportation_enhancement_(off_system) regional_surface_transportation_block_grant_program_(rstbgp)_and_highway_infrastructure_program_(hip) regional_transportation_planning_agency_(rtpa)_stp_match_exchange sb1_funded_freeway_service_patrol shopp-_traffic_light_synchronization_program_(tlsp)-_proposition_1b_bond_funds 
safe_routes_to_school_(sr2s_and_srts) set-aside_coordinated_border_infrastructure_(cbi)_program_under_fast_act solutions_for_congested_corridors_program_(sccp) special_programs state-local_partnership_program_(slpp)_and_local_partnership_program_(lpp-formulaic) structures_seismic_retrofit_ trade_corridor_enhancement_account_(tcea)_programs_–_local_share trade_corridor_enhancement_account_(tcea)_programs_–_state_share trade_corridors_improvement_fund_(tcif)_program_local_streets_&_roads traffic_congestion_relief_program_(_tcrp_) unknown
26645475(038)10/1/2020: Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022. There will be a gap of time that is not reimbursable. CR\n", + " 24185475(038)10/1/2020: Original AED date was 9/30/2020, new sequence being done to extend date to 9/30/2022. There will be a gap of time that is not reimbursable. CR\n", "\n", "CMAQ Emissions Benefit: .03 ROG, .02 NOx, .01 PM10\n", "Project has EPSP approval for $2,646,524 of CMAQ for R/W to 15/16 FY. And EPSP for $291,476 of CMAQ for PE to 15/16 FY.\n", "0336291000.000000Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.Pedestrian and Bike PathnannanNoneNone0-CHtsNNone2023-10-13 13:34:21Citrus Heights3067Sacramento CountyNonePedestrian and Bike Pathsingle phaseYesNoNoNoYesNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoYesNoNoNoNoNoNoNoNo0336291000.000000Auburn Blvd. Complete Streets - Phase 2. On Auburn Blvd, in Citrus Heights from Rusch Park to Northern City Limits.Pedestrian and Bike PathnannanNoneNone0-CHtsNNone2023-10-13 13:34:21Citrus Heights3067Sacramento CountyNonePedestrian and Bike Pathsingle phaseYesNoNoNoYesNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoNoYesNoNoNoNoNoYesNoNoNoNoNoNoNoNo
" ], "text/plain": [ - "" + "" ] }, - "execution_count": 277, + "execution_count": 137, "metadata": {}, "output_type": "execute_result" } @@ -4563,7 +3693,7 @@ }, { "cell_type": "markdown", - "id": "643a54b1", + "id": "6f411cf3", "metadata": {}, "source": [ "## Save to Excel/Final Touches" @@ -4571,8 +3701,8 @@ }, { "cell_type": "code", - "execution_count": 278, - "id": "ccc8fee4", + "execution_count": 138, + "id": "fa95350c", "metadata": {}, "outputs": [], "source": [ @@ -4583,17 +3713,17 @@ }, { "cell_type": "code", - "execution_count": 279, - "id": "12096bef", + "execution_count": 139, + "id": "d3b61876", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(11768, 62)" + "(11272, 62)" ] }, - "execution_count": 279, + "execution_count": 139, "metadata": {}, "output_type": "execute_result" } @@ -4604,17 +3734,17 @@ }, { "cell_type": "code", - "execution_count": 280, - "id": "f9e11b2a", + "execution_count": 140, + "id": "4b0c3eb1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "11768" + "11272" ] }, - "execution_count": 280, + "execution_count": 140, "metadata": {}, "output_type": "execute_result" } @@ -4625,8 +3755,8 @@ }, { "cell_type": "code", - "execution_count": 281, - "id": "d1896385", + "execution_count": 141, + "id": "63073ff2", "metadata": {}, "outputs": [], "source": [ @@ -4636,8 +3766,8 @@ }, { "cell_type": "code", - "execution_count": 282, - "id": "4a61a680", + "execution_count": 142, + "id": "eded55f3", "metadata": {}, "outputs": [], "source": [ @@ -4647,8 +3777,8 @@ }, { "cell_type": "code", - "execution_count": 283, - "id": "99035caf", + "execution_count": 143, + "id": "b65fdd45", "metadata": {}, "outputs": [], "source": [ @@ -4657,18 +3787,18 @@ }, { "cell_type": "code", - "execution_count": 284, - "id": "96b26252", + "execution_count": 147, + "id": "ebc5cc10", "metadata": {}, "outputs": [], "source": [ "\n", - "with pd.ExcelWriter(\"./LP2000.xlsx\") as writer:\n", + "with pd.ExcelWriter(\"LP2000_projects.xlsx\") as writer:\n", 
" project_df.to_excel(writer, sheet_name=\"project\", index=False)\n", " county_df.to_excel(writer, sheet_name=\"county\", index=False)\n", " district_df.to_excel(writer, sheet_name=\"district\", index=False)\n", " awards_df.to_excel(writer, sheet_name=\"awards\", index=False)\n", - " final_fund_phase_df.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n" + " final_fund_phase_df2.to_excel(writer, sheet_name=\"phase_funding\", index=False)\n" ] } ], diff --git a/project_list/archive_compile_all_projects.ipynb b/project_list/archive_compile_all_projects.ipynb new file mode 100644 index 000000000..1863ec094 --- /dev/null +++ b/project_list/archive_compile_all_projects.ipynb @@ -0,0 +1,2642 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "77106c12-82aa-4be4-8d9c-e66fafec4d67", + "metadata": { + "tags": [] + }, + "source": [ + "## General function to clean up data from various grants\n", + "To-Do\n", + "* De duplicate projects\n", + "* Rearrange counties in County column in alphabetical order.\n", + "* Millions to thousands -> seems easier to read.\n", + "* Differentiate btwn project START year and END year.\n", + "* Add Post Mile column\n", + "\n", + "Done\n", + "* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n", + "\n", + "Strategy/Questions:\n", + "* Make sure one row=one project. How? \n", + "* What should be the unit of project cost?\n", + "* Break up Caltrans by district or leave as is? \n", + "\n", + "Columns/Data Dictionary\n", + "* project_title (str): N/A.\n", + "* lead_agency (str): the entity leading the project or receiving the grant.\n", + "* project_year (TBD): when the project will begin.\n", + "* project_category (str): the category/categories a project belongs to.\n", + "* grant_program (str): the fund a project is receiving funds for. This does not preclude the fact that a project can receive funds from mulitple programs. 
\n", + "* phase (str): the latest phase the project is in.\n", + "* project_description (str): N/A.\n", + "* total_project_cost_(millions): N/A.\n", + "* total_available_funds_(millions): all the funds available to the project.\n", + "* unfunded_needs_(millions): total_project_cost_(millions) minus total_available_funds_(millions).\n", + "* city (str): the city a project is located in.\n", + "* county (str): the county a project is located in.\n", + "* location (str): an address or more detailed information regarding where the project will take place.\n", + "* geometry: geospatial information.\n", + "* data_source (str): N/A.\n", + "* notes (str): additional information regarding the project.\n", + "* funding_notes (str): additional funding information regarding the project.\n", + "* ct_district (int): the Caltrans district a project is located in.\n", + "* fully_funded (str): comparing total_available_funds_(millions) and total_project_cost_(millions) to figure out whether a project is fully, partially, or not funded.\n", + "* enough_info (str): counting the # of null values and # of strings in the project description to determine whether or not a project has enough information." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "06ac19fe-7b6c-4560-9740-8a4f72c5b6e1", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n", + " warnings.warn(\n", + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. 
You can do this before starting the Python process, or in your code before importing geopandas:\n", + "\n", + "import os\n", + "os.environ['USE_PYGEOS'] = '0'\n", + "import geopandas\n", + "\n", + "In a future release, GeoPandas will switch to using Shapely by default. If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", + " import geopandas as gpd\n" + ] + } + ], + "source": [ + "import _cleaning_utils\n", + "import _harmonization_utils as harmonization_utils\n", + "import _state_rail_plan_utils as srp_utils\n", + "# import geopandas as gpd\n", + "import pandas as pd\n", + "# import shapely\n", + "from calitp_data_analysis.sql import to_snakecase" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "d78be4e7-2349-4ffd-9d59-f9fa450ae7dd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "'\\nimport re\\nimport nltk\\nfrom nltk import ngrams\\nfrom nltk.corpus import stopwords\\nfrom nltk.tokenize import sent_tokenize, word_tokenize\\nimport re\\nfrom collections import Counter\\nfrom autocorrect import Speller\\n'" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\"\"\"\n", + "import re\n", + "import nltk\n", + "from nltk import ngrams\n", + "from nltk.corpus import stopwords\n", + "from nltk.tokenize import sent_tokenize, word_tokenize\n", + "import re\n", + "from collections import Counter\n", + "from autocorrect import Speller\n", + "\"\"\"" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "e7b68eeb-422d-4be8-b557-7bd9e95599af", + "metadata": {}, + "outputs": [], + "source": [ + "pd.options.display.max_columns = 100\n", + "pd.options.display.float_format = \"{:.2f}\".format\n", + "pd.set_option(\"display.max_rows\", None)\n", + 
"pd.set_option(\"display.max_colwidth\", None)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "0541b671-a020-485f-9b0a-f46238f1d4f9", + "metadata": {}, + "outputs": [], + "source": [ + "# lost = harmonization_utils.load_lost()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "db720477-44f5-4cbd-80ac-a0fe86e47cf9", + "metadata": {}, + "outputs": [], + "source": [ + "def create_notes(df, note_cols: list, new_col_name: str):\n", + " \"\"\"\n", + " Concat multiple columns into one.\n", + " \"\"\"\n", + " prefix = \"_\"\n", + " for column in note_cols:\n", + " df[f\"{prefix}{column}\"] = df[column].astype(str)\n", + " note_cols = [prefix + sub for sub in note_cols]\n", + "\n", + " # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values\n", + " def combine_notes(x):\n", + " return \", \".join([col + \": \" + x[col] for col in note_cols])\n", + "\n", + " df[new_col_name] = df.apply(combine_notes, axis=1)\n", + " df[new_col_name] = df[new_col_name].str.replace(\"_\", \" \")\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "78b5d13c-f4ba-4baf-8c3a-f520a960a44a", + "metadata": {}, + "outputs": [], + "source": [ + "# srp = harmonization_utils.load_state_rail_plan()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "f3829bd6-8fc1-4c15-809f-75020248a722", + "metadata": {}, + "outputs": [], + "source": [ + "columns_to_keep = [\n", + " \"project_title\",\n", + " \"lead_agency\",\n", + " \"project_year\",\n", + " \"project_category\",\n", + " \"project_start_year\",\n", + " \"project_completion_year\",\n", + " \"grant_program\",\n", + " \"phase\",\n", + " \"project_description\",\n", + " \"total_project_cost_(millions)\",\n", + " \"total_available_funds_(millions)\",\n", + " \"unfunded_needs_(millions)\",\n", + " \"city\",\n", + " \"county\",\n", + " \"location\",\n", + " \"post_mile\",\n", + " \"geometry\",\n", + " \"data_source\",\n", + " 
\"notes\",\n", + " \"funding_notes\",\n", + " \"ct_district\",\n", + " \"project_description2\",\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "291e821d-9b3f-40a2-bde9-7a12b31eb410", + "metadata": {}, + "outputs": [], + "source": [ + "def harmonizing(\n", + " df,\n", + " agency_name_col: str,\n", + " project_name_col: str,\n", + " project_description_col: str,\n", + " project_category_col: str,\n", + " phase_col: str,\n", + " project_cost_col: str,\n", + " location_col: str,\n", + " geography_col: str,\n", + " post_mile_col:str,\n", + " county_col: str,\n", + " city_col: str,\n", + " district_col:str, \n", + " project_start_year_col: str,\n", + " project_completion_year_col:str,\n", + " program_col: str,\n", + " data_source: str,\n", + " fund_cols: list,\n", + " notes_cols: list,\n", + " cost_in_millions: bool = True,\n", + "):\n", + " \"\"\"\n", + " Take a dataset and change the column names/types to\n", + " default names and formats.\n", + " \"\"\"\n", + " rename_columns = {\n", + " agency_name_col: \"lead_agency\",\n", + " project_name_col: \"project_title\",\n", + " project_description_col: \"project_description\",\n", + " project_category_col: \"project_category\",\n", + " project_cost_col: \"total_project_cost_(millions)\",\n", + " location_col: \"location\",\n", + " geography_col: \"geometry\",\n", + " phase_col: \"phase\",\n", + " post_mile_col: \"post_mile\",\n", + " county_col: \"county\",\n", + " city_col: \"city\",\n", + " district_col: \"ct_district\",\n", + " project_start_year_col: \"project_start_year\",\n", + " project_end_year_col: \"project_completion_year\",\n", + " program_col: \"grant_program\",\n", + " }\n", + " # Rename columns\n", + " df = df.rename(columns=rename_columns)\n", + " \n", + " # Clean up monetary columns to be interger\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + " for i in cost_columns:\n", + " df[i] = df[i].apply(pd.to_numeric, 
errors=\"coerce\").fillna(0)\n", + " \n", + " # Divide cost columns by millions\n", + " # If bool is set to True\n", + " if cost_in_millions:\n", + " for i in fund_cols + [\"total_project_cost_(millions)\"]:\n", + " df[i] = df[i].divide(1_000_000)\n", + "\n", + " # Add new column with funding breakout\n", + " # Since it's summarized above and the details are suppressed.\n", + " df[\"total_available_funds_(millions)\"] = df[fund_cols].sum(axis=1)\n", + " df = create_notes(df, fund_cols, \"funding_notes\")\n", + " \n", + " # Add column for unfunded needs\n", + " df[\"unfunded_needs_(millions)\"] = df[\"total_project_cost_(millions)\"] - df[\"total_available_funds_(millions)\"]\n", + " \n", + " # Add program\n", + " df[\"data_source\"] = data_source\n", + " \n", + " # Create columns even if they don't exist, just to harmonize\n", + " # before concatting.\n", + " create_columns = [\n", + " \"county\",\n", + " \"city\",\n", + " \"notes\",\n", + " \"project_start_year\",\n", + " \"project_completion_year\",\n", + " \"post_mile\",\n", + " \"project_category\",\n", + " \"location\",\n", + " \"phase\",\n", + " \"ct_district\"\n", + " ]\n", + " for column in create_columns:\n", + " if column not in df:\n", + " df[column] = \"None\"\n", + " if \"geometry\" not in df:\n", + " df[\"geometry\"] = None\n", + " if \"grant_program\" not in df:\n", + " df[\"grant_program\"] = data_source\n", + " \n", + " # Create notes - aka other columns that were supressed\n", + " df = create_notes(df, notes_cols, \"notes\")\n", + " \n", + " # Clean up string columns\n", + " string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n", + " for i in string_cols:\n", + " df[i] = df[i].str.replace(\"_\", \" \").str.strip().str.title()\n", + "\n", + " # Fill in any nulls\n", + " df['project_description2'] = df.project_description.fillna(df.project_title)\n", + " df = df.fillna(df.dtypes.replace({\"float64\": 0.0, \"object\": \"None\"}))\n", + "\n", + " # Only keep certain columns\n", + " 
df = df[columns_to_keep]\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "0ea5badb-841b-4941-b48f-23d750b5ed27", + "metadata": {}, + "outputs": [], + "source": [ + "def harmonize_srp():\n", + " df = harmonization_utils.load_state_rail_plan()\n", + " df = harmonizing(\n", + " df,\n", + " agency_name_col=\"lead_agency\",\n", + " project_name_col=\"project_name\",\n", + " project_description_col=\"project_description\",\n", + " project_category_col=\"project_category\",\n", + " phase_col=\"\",\n", + " project_cost_col=\"total_project_cost\",\n", + " location_col=\"corridor\",\n", + " geography_col=\"\",\n", + " county_col=\"\",\n", + " city_col=\"\",\n", + " district_col=\"\",\n", + " project_year_col=\"\",\n", + " program_col=\"\",\n", + " data_source=\"State Rail Plan\",\n", + " fund_cols=[],\n", + " notes_cols = ['project_time_horizon','srp_region', \n", + " 'sub_corridor_node_1', 'sub_corridor_node_2', 'itsp_corridor'],\n", + " cost_in_millions=True,\n", + " )\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "2b60a4e7-cc69-41fb-9285-c32f9fa0791e", + "metadata": {}, + "outputs": [], + "source": [ + "# srp_harmonized = harmonize_srp()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "2100f3b7-55c8-45ad-b3d1-99a0319c7ac8", + "metadata": {}, + "outputs": [], + "source": [ + "# srp_harmonized.tail()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "3142aacb-d5f4-4bc1-8cc8-99f50c45b301", + "metadata": {}, + "outputs": [], + "source": [ + "# srp_og = harmonization_utils.load_state_rail_plan()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "0ae0b8bd-3e5b-4119-8fee-d496689f9c7c", + "metadata": {}, + "outputs": [], + "source": [ + "# srp_og.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9307a340-c699-4d93-ba30-abe04563dd8d", + "metadata": {}, + "outputs": [], + "source": [ + "# srp_og.columns" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "23804222-466a-4754-a1ad-fd8f3f8a5239", + "metadata": {}, + "outputs": [], + "source": [ + "def harmonize_lost():\n", + " df = harmonization_utils.load_lost()\n", + " df = harmonizing(\n", + " df,\n", + " agency_name_col=\"agency\",\n", + " project_name_col=\"project_title\",\n", + " project_description_col=\"project_description\",\n", + " project_category_col=\"project_category\",\n", + " project_cost_col=\"cost__in_millions_\",\n", + " phase_col=\"\",\n", + " location_col=\"location\",\n", + " geography_col=\"\",\n", + " county_col=\"county\",\n", + " city_col=\"city\",\n", + " district_col = \"\",\n", + " project_year_col=\"\",\n", + " program_col=\"measure\",\n", + " data_source=\"Local Options Sales Tax\",\n", + " fund_cols=[\n", + " \"estimated_lost_funds\",\n", + " \"estimated_federal_funds\",\n", + " \"estimated_state_funds\",\n", + " \"estimated_local_funds\",\n", + " \"estimated_other_funds\",\n", + " ],\n", + " notes_cols = [\"notes\"],\n", + " cost_in_millions=False,\n", + " )\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "e13f87d5-514f-404f-8cc8-4dbf877754da", + "metadata": {}, + "outputs": [], + "source": [ + "# lost_og = harmonization_utils.load_lost()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "60d66f94-7915-43b7-990e-896600e20d40", + "metadata": {}, + "outputs": [], + "source": [ + "# lost_og.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "20e8a81a-e6b1-4bdf-a0f8-21420c62b68a", + "metadata": {}, + "outputs": [], + "source": [ + "def harmonize_sb1():\n", + " df = harmonization_utils.load_sb1()\n", + " df = harmonizing(\n", + " df,\n", + " agency_name_col=\"implementingagency\",\n", + " project_name_col=\"projecttitle_x\",\n", + " project_description_col=\"projectdescription\",\n", + " project_category_col=\"\",\n", + " phase_col=\"projectstatuses\",\n", + " 
project_cost_col=\"totalcost\",\n", + " location_col=\"\",\n", + " geography_col=\"geometry\",\n", + " county_col=\"countynames\",\n", + " city_col=\"citynames\",\n", + " district_col = \"ct_districts\",\n", + " project_year_col=\"fiscalyears\",\n", + " program_col=\"programcodes\",\n", + " data_source=\"SB1\",\n", + " fund_cols=[\"sb1funds\", \"iijafunds\"],\n", + " notes_cols = ['iijaprogram','dateupdated','isonshs', 'isonshscodes','agencies', 'popup'],\n", + " cost_in_millions=True,\n", + " )\n", + "\n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "4d39d086-ef36-4f21-ab44-17980304be74", + "metadata": {}, + "outputs": [], + "source": [ + "# sb1_og = harmonization_utils.load_sb1()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "9c65170e-17ef-42da-b161-358e40f815a8", + "metadata": {}, + "outputs": [], + "source": [ + "# sb1_og.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "5ade46ae-4768-4855-b0ea-9ff4ec7607af", + "metadata": {}, + "outputs": [], + "source": [ + "# sb1_og.drop(columns = ['geometry']).sample(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "5cab53c4-9c63-4bd4-b837-f43e62900e8d", + "metadata": {}, + "outputs": [], + "source": [ + "# harmonized_sb1 = harmonize_sb1()" + ] + }, + { + "cell_type": "markdown", + "id": "6c14df71-56af-43a1-b0e0-2d02ef38e18e", + "metadata": {}, + "source": [ + "### Stacking" + ] + }, + { + "cell_type": "markdown", + "id": "53a8e2a2-9d49-4e55-a2ee-bd6224d7fb61", + "metadata": { + "tags": [] + }, + "source": [ + "#### Does this project have enough information to be useful?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "3926aa77-991b-48be-b57d-04077a0a485b", + "metadata": {}, + "outputs": [], + "source": [ + "def categorize_info(df): \n", + " \n", + " #Get percentiles in objects for total vehicle.\n", + " p50_project_desc= df.project_description_count.quantile(0.50).astype(float)\n", + " p50_null_values = df.total_percent_null_values.quantile(0.50).astype(float)\n", + " \n", + " #Function for fleet size\n", + " def percentile_info (row):\n", + " if ((row.project_description_count >= p50_project_desc) and (row.total_percent_null_values <= p50_null_values)):\n", + " return \"Yes\"\n", + " else: \n", + " return \"No\"\n", + " df[\"enough_info\"] = df.apply(lambda x: percentile_info(x), axis=1)\n", + " \n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "48854cb1-3fa8-4d4e-8e8f-7218fc8b9c7e", + "metadata": {}, + "outputs": [], + "source": [ + "def enough_info(df):\n", + " # Select string columns\n", + " string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n", + " \n", + " # https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas\n", + " # Count \"nones\" in string columns\n", + " df['none_counts'] = df[string_cols].astype(str).sum(axis=1).str.lower().str.count(\"none\")\n", + " \n", + " # Count zeroes\n", + " df['zero_counts'] = (df == 0).astype(int).sum(axis=1)\n", + " \n", + " # Total up all none/zeroes \n", + " df[\"total_percent_null_values\"] = df[['none_counts','zero_counts']].sum(axis=1)/len(df.columns) * 100\n", + " \n", + " # Count project descriptions\n", + " df[\"project_description_count\"] = df[\"project_description\"].str.count('\\w+')\n", + " \n", + " # Categorize whether it has enough info or not\n", + " df = categorize_info(df)\n", + " \n", + " # Compress columns to retain some info\n", + " df['counts'] = 'number of strings in project desc: ' + df.project_description_count.astype(str) + ' % of 
null values:' + df.total_percent_null_values.astype(int).astype(str)\n", + " \n", + " df = df.drop(columns = ['none_counts','zero_counts','project_description_count','total_percent_null_values'])\n", + " return df " + ] + }, + { + "cell_type": "markdown", + "id": "91c4e4b0-f28d-4956-9274-d17a3306801e", + "metadata": { + "tags": [] + }, + "source": [ + "#### Correct lead agencies again" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "f0f564ce-5551-4750-94b6-bb7c5b056949", + "metadata": {}, + "outputs": [], + "source": [ + "def flip_county_city(df, agency_col:str):\n", + " # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n", + " to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]\n", + " to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)\n", + " to_correct['str_len'] = to_correct[agency_col].str.split().str.len()\n", + " to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)\n", + " to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)\n", + " to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']\n", + " \n", + " new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))\n", + " df['agency_corrected'] = df[agency_col].map(new_names_dictionary)\n", + " df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])\n", + " \n", + " df = df.drop(columns = [agency_col])\n", + " df = df.rename(columns = {\"agency_corrected\":agency_col})\n", + " \n", + " return df " + ] + }, + { + "cell_type": "code", + "execution_count": 126, + "id": "3946f71e-f987-452b-8269-331d6cb461c7", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# all_projects_metric.lead_agency.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "0bd2a79a-700b-446e-8346-5aa6fb2309f8", + "metadata": 
{}, + "outputs": [], + "source": [ + "def add_all_projects():\n", + "\n", + " # Load dataframes\n", + " state_rail_plan = harmonize_srp()\n", + " lost = harmonize_lost()\n", + " sb1 = harmonize_sb1()\n", + "\n", + " # Concat for df\n", + " df = pd.concat([lost, state_rail_plan, sb1])\n", + " \n", + " # Clean agency names\n", + " df = harmonization_utils.organization_cleaning(df, \"lead_agency\")\n", + " df = flip_county_city(df, 'lead_agency')\n", + " \n", + " # Determine if the project completely funded or not?\n", + " # Add up all available funds\n", + " df[\"fully_funded\"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)\n", + " \n", + " # Does this project have enough info?\n", + " df = enough_info(df)\n", + " \n", + " \n", + " return df" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "5dcafef7-30b9-4582-93c8-188ede6b8562", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:37: FutureWarning: The default value of regex will change from True to False in a future version.\n", + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/home/jovyan/data-analyses/project_list/_harmonization_utils.py:34: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n" + ] + } + ], + "source": [ + "all_projects = add_all_projects()" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "64c6a43d-0a8c-4f7c-a3cc-df3415163bf4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['project_title', 'lead_agency', 'project_year', 'project_category',\n", + " 'grant_program', 'phase', 'project_description',\n", + " 'total_project_cost_(millions)', 'total_available_funds_(millions)',\n", + " 'unfunded_needs_(millions)', 'city', 'county', 'location', 'geometry',\n", + " 'data_source', 'notes', 'funding_notes', 'ct_district',\n", + " 'project_description2', 'fully_funded', 'enough_info', 'counts'],\n", + " dtype='object')" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "id": "78825d55-c1b0-447b-b33e-493c7165aa25", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_titlelead_agencyproject_yearproject_categorygrant_programphaseproject_descriptiontotal_project_cost_(millions)total_available_funds_(millions)unfunded_needs_(millions)citycountylocationdata_sourcenotesfunding_notesct_districtproject_description2fully_fundedenough_infocounts
358NoneNone19/20, 20/21NoneSgrIn Progress, PlannedNone0.120.120.00CorcoranKingsNoneSb1Iijaprogram: , Dateupdated: 2021-09-09, Isonshs: N, Isonshscodes: N, Agencies: City Of Corcoran, Popup: NoneSb1Funds: 0.121909, Iijafunds: 0.0NoneNoneFully fundedNonumber of strings in project desc: 1 % of null values:40
1085Spring Street OverlayCity Of Signal Hill19/20NoneLsrPlannedNone3.000.132.87Signal HillLos AngelesNoneSb1Iijaprogram: , Dateupdated: 6/30/2021, Isonshs: N, Isonshscodes: N, Agencies: City Of Signal Hill, Popup: NoneSb1Funds: 0.126705, Iijafunds: 0.0NoneNonePartially fundedNonumber of strings in project desc: 1 % of null values:27
2106Major Damage RestorationCaltrans20/21NoneShoppIn ProgressA $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.16.529.087.44NoneDel NorteNoneSb1Iijaprogram: State Hwy Operations & Protection Program Major-Federal, Dateupdated: 2022-06-28, Isonshs: None, Isonshscodes: Y, Agencies: Caltrans, Popup: Major Damage RestorationbrSb1Funds: 0.0, Iijafunds: 9.08356601A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.Partially fundedYesnumber of strings in project desc: 25 % of null values:18
\n", + "
" + ], + "text/plain": [ + " project_title lead_agency project_year \\\n", + "358 None None 19/20, 20/21 \n", + "1085 Spring Street Overlay City Of Signal Hill 19/20 \n", + "2106 Major Damage Restoration Caltrans 20/21 \n", + "\n", + " project_category grant_program phase \\\n", + "358 None Sgr In Progress, Planned \n", + "1085 None Lsr Planned \n", + "2106 None Shopp In Progress \n", + "\n", + " project_description \\\n", + "358 None \n", + "1085 None \n", + "2106 A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System. \n", + "\n", + " total_project_cost_(millions) total_available_funds_(millions) \\\n", + "358 0.12 0.12 \n", + "1085 3.00 0.13 \n", + "2106 16.52 9.08 \n", + "\n", + " unfunded_needs_(millions) city county location \\\n", + "358 0.00 Corcoran Kings None \n", + "1085 2.87 Signal Hill Los Angeles None \n", + "2106 7.44 None Del Norte None \n", + "\n", + " data_source \\\n", + "358 Sb1 \n", + "1085 Sb1 \n", + "2106 Sb1 \n", + "\n", + " notes \\\n", + "358 Iijaprogram: , Dateupdated: 2021-09-09, Isonshs: N, Isonshscodes: N, Agencies: City Of Corcoran, Popup: None \n", + "1085 Iijaprogram: , Dateupdated: 6/30/2021, Isonshs: N, Isonshscodes: N, Agencies: City Of Signal Hill, Popup: None \n", + "2106 Iijaprogram: State Hwy Operations & Protection Program Major-Federal, Dateupdated: 2022-06-28, Isonshs: None, Isonshscodes: Y, Agencies: Caltrans, Popup: Major Damage Restorationbr \n", + "\n", + " funding_notes ct_district \\\n", + "358 Sb1Funds: 0.121909, Iijafunds: 0.0 None \n", + "1085 Sb1Funds: 0.126705, Iijafunds: 0.0 None \n", + "2106 Sb1Funds: 0.0, Iijafunds: 9.083566 01 \n", + "\n", + " project_description2 \\\n", + "358 None \n", + "1085 None \n", + "2106 A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System. 
\n", + "\n", + " fully_funded enough_info \\\n", + "358 Fully funded No \n", + "1085 Partially funded No \n", + "2106 Partially funded Yes \n", + "\n", + " counts \n", + "358 number of strings in project desc: 1 % of null values:40 \n", + "1085 number of strings in project desc: 1 % of null values:27 \n", + "2106 number of strings in project desc: 25 % of null values:18 " + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects.drop(columns = ['geometry']).sample(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "0983ba29-f492-4a1a-ad40-78ebd291f7d2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Shopp 1631\n", + "Imperial D 2008 726\n", + "Hm 520\n", + "Lsr 285\n", + "State Rail Plan 276\n", + "Atp 216\n", + "Sgr 156\n", + "Stip 126\n", + "San Mateo W 2018 91\n", + "Los Angeles Angeles M 2016 89\n", + "San Benito G 2004 86\n", + "Santa Clara B 2016 85\n", + "Tircp 82\n", + "Shopa 79\n", + "San Mateo A2 2006 78\n", + "Alameda B 2000 62\n", + "San Diego A 2004 59\n", + "San Joaquin K 2003 56\n", + "Tcep 55\n", + "San Bernardino I2 2018 51\n", + "Sacramento A2 2004 51\n", + "Tulare R 2006 49\n", + "Sta 49\n", + "Sonoma M 2004 44\n", + "Alameda Bb 2014 40\n", + "Lpp-F 40\n", + "Santa Barbara A 2008 37\n", + "Los Angeles Angeles R 2008 37\n", + "Madera T 2006 36\n", + "Sccp 34\n", + "San Francisco K 2004 28\n", + "Riverside A2 2006 27\n", + "Lpp-C 21\n", + "Stanislaus L 2016 20\n", + "Contra Costa J 2004 19\n", + "Orange M2 2002 19\n", + "Santa Clara A 2000 14\n", + "Sra 11\n", + "Monterey X 2016 11\n", + "Santa Cruz D 2016 9\n", + "Marin A 2004 7\n", + "Monterey Salinas Transit Q 2016 6\n", + "Sonoma Q 2008 5\n", + "Fresno C 2006 5\n", + "Los Angelest Alameda Bb 2014 1\n", + "Santa Clara B 2008 1\n", + "Name: grant_program, dtype: int64" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"all_projects.grant_program.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "301b9bde-499e-445d-a27c-f50f522e4aa9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Sb1 3305\n", + "Local Options Sales Tax 1849\n", + "State Rail Plan 276\n", + "Name: data_source, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects.data_source.value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "0c066920-6b09-4584-bc82-4f88b41e00d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.00 20.06\n", + "0.33 2.65\n", + "0.25 1.25\n", + "7.61 0.85\n", + "17.86 0.77\n", + "Name: total_project_cost_(millions), dtype: float64" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects[\"total_project_cost_(millions)\"].value_counts().head() / len(all_projects) * 100" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "413ac763-c08b-48b0-91d5-6e53fd8f2c32", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "No available funding info 1963\n", + "Partially funded 1796\n", + "No project cost info 1089\n", + "Fully funded 582\n", + "Name: fully_funded, dtype: int64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects.fully_funded.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "8add5491-77d7-4eaa-ad79-57072f7eddd9", + "metadata": {}, + "source": [ + "### Metrics\n", + "* Rewrite to be shorter?\n", + "* Correct spelling of descriptions?\n", + "* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "id": "6d6253cd-b5f8-4431-a575-9a274e6e8bae", + "metadata": {}, + "outputs": [], + 
def add_categories(df):
    """
    Create general categories for each project.

    Adapted from
    https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305

    Each value of the ``project_description2`` column is normalized
    (lower-cased, with "-", "." and ":" removed) and searched for the
    keywords of every category below. Matching is plain substring matching.
    A description can fall into several categories; the matching labels are
    joined with a single space, in the fixed order: active transportation,
    transit, bridge, street, freeway, infrastructure, congestion relief,
    passenger mode shift, safety.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``project_description2`` column. Values that are not
        strings (None / NaN) are treated as "no description".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra string column, ``categories``
        (empty string when nothing matched). ``df`` itself is not modified.
    """

    def _normalize(text):
        # Same clean-up for descriptions AND keywords, so that keywords
        # written with capitals or hyphens ('Cl ', 'bulb-out', 'k-rail',
        # 'highway-rail') can actually match the cleaned description.
        return text.lower().replace("-", "").replace(".", "").replace(":", "")

    ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist',
                             'pedestrian',
                             ## including the spelling errors of `pedestrian`
                             'pedestrain',
                             'crosswalk',
                             'bulb out', 'bulb-out',
                             'active transp', 'traffic reduction',
                             'speed reduction', 'ped', 'srts',
                             'safe routes to school',
                             'sidewalk', 'side walk', 'Cl ', 'trail',
                             'atp'
                             ]
    # Station comes up a few times as a charging station and also as a
    # train station; the NOT_INC list below filters some of those out.
    TRANSIT = ['bus', 'metro', 'station',
               'transit', 'fare', 'brt', 'yarts', 'railroad', 'highway-rail'
               ]
    BRIDGE = ["bridge", 'viaduct']
    # 'seal' and 'sign' are two separate keywords: without the comma Python
    # concatenates adjacent string literals into the single word 'sealsign'.
    STREET = ['traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal',
              'sign', 'stripe', 'striping', 'median',
              'guard rail', 'guardrail',
              'road', 'street',
              'sinkhole', 'intersection', 'signal', 'curb',
              'light', 'tree', 'pavement', 'roundabout'
              ]

    FREEWAY = ['hov ', 'hot ', 'freeway', 'highway', 'express lanes', 'hwy']

    # NOTE(review): very short keywords such as 'er' (and 'ped' above) are
    # matched as substrings, so they also hit words like "overlay" or
    # "speed". Left unchanged here; confirm whether that is intended.
    INFRA_RESILIENCY_ER = ['repair', 'emergency', 'replace', 'retrofit', 'er',
                           'rehab', 'improvements', 'seismic', 'reconstruct', 'restoration']

    CONGESTION_RELIEF = ['congestion', 'rideshare', 'ridesharing', 'vanpool', 'car share']

    # Words that veto the "transit" label even when a TRANSIT keyword hits.
    NOT_INC = ['charging', 'fueling', 'cng', 'bridge', 'trail',
               'k-rail', 'guardrails', 'bridge rail', 'guard', 'guarrail']

    PASSENGER_MODE = ['non sov', 'high quality transit areas',
                      'hqta', 'hov']

    # 'collisoins' (misspelling) is kept for descriptions that contain the
    # typo; the correct spelling is added so real "collisions" match too.
    SAFETY = ['fatalities', 'safe', 'speed management', 'signal coordination',
              'slow speeds', 'roundabouts', 'victims', 'collisoins', 'collisions',
              'protect', 'crash', 'modification factors', 'safety system']

    # (label, keywords) pairs, in the order the labels appear in the output.
    labelled_keywords = [
        ("active transportation", ACTIVE_TRANSPORTATION),
        ("transit", TRANSIT),
        ("bridge", BRIDGE),
        ("street", STREET),
        ("freeway", FREEWAY),
        ("infrastructure", INFRA_RESILIENCY_ER),
        ("congestion relief", CONGESTION_RELIEF),
        ("passenger mode shift", PASSENGER_MODE),
        ("safety", SAFETY),
    ]
    # Normalize the keywords once, not once per row.
    normalized_keywords = [
        (label, tuple(_normalize(keyword) for keyword in keywords))
        for label, keywords in labelled_keywords
    ]
    transit_exclusions = tuple(_normalize(keyword) for keyword in NOT_INC)

    def categorize_project_description(description):
        """
        Return the space-separated category labels whose keywords occur in
        one project description ("" if none, or if it is not a string).
        """
        if not isinstance(description, str):
            return ""
        text = _normalize(description)

        labels = []
        for label, keywords in normalized_keywords:
            if not any(keyword in text for keyword in keywords):
                continue
            # Transit is only assigned when no exclusion word is present.
            if label == "transit" and any(
                exclude_word in text for exclude_word in transit_exclusions
            ):
                continue
            labels.append(label)
        # Joining only the labels that matched avoids the runs of blanks
        # left behind when empty placeholders are joined.
        return " ".join(labels)

    df2 = df.copy()
    # A plain list is assigned positionally, so repeated index labels in
    # df (e.g. after concatenating several sources) cannot cause trouble.
    df2["categories"] = [
        categorize_project_description(description)
        for description in df2["project_description2"]
    ]

    return df2
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_titlelead_agencyproject_yearproject_categorygrant_programphaseproject_descriptiontotal_project_cost_(millions)total_available_funds_(millions)unfunded_needs_(millions)citycountylocationdata_sourcenotesfunding_notesct_districtproject_description2fully_fundedenough_infocountscategories
1886Safety - Hm4Caltrans21/22NoneHmIn ProgressMaintain/Repair Transportaiton Management Systems0.200.000.20VisaliaTulareNoneSb1Iijaprogram: None, Dateupdated: 2022-09-19, Isonshs: None, Isonshscodes: N, Agencies: Caltrans, Popup:Sb1Funds: 0.0, Iijafunds: 0.006Maintain/Repair Transportaiton Management SystemsNo available funding infoYesnumber of strings in project desc: 5 % of null values:22infrastructure
1119Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 SNoneNoneFreeway Safety And Congestion Relief ProgramSacramento A2 2004NoneNone47.000.0047.00NoneSacramentoNoneLocal Options Sales TaxNotes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.Estimated Lost Funds: 0.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 0.0NoneBus/Carpool Ramp Connection From Sr 50 E To Sr 99 SNo available funding infoNonumber of strings in project desc: 1 % of null values:40transit
1589Highway 101: Betteravia Road InterchangeNoneNoneNoneSanta Barbara A 2008NoneImprove The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.2.005.00-3.00NoneSanta BarbaraNoneLocal Options Sales TaxNotes: NanEstimated Lost Funds: 2.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 3.0NoneImprove The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.Fully fundedNonumber of strings in project desc: 24 % of null values:36street freeway infrastructure
\n", + "
" + ], + "text/plain": [ + " project_title lead_agency \\\n", + "1886 Safety - Hm4 Caltrans \n", + "1119 Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S None \n", + "1589 Highway 101: Betteravia Road Interchange None \n", + "\n", + " project_year project_category \\\n", + "1886 21/22 None \n", + "1119 None Freeway Safety And Congestion Relief Program \n", + "1589 None None \n", + "\n", + " grant_program phase \\\n", + "1886 Hm In Progress \n", + "1119 Sacramento A2 2004 None \n", + "1589 Santa Barbara A 2008 None \n", + "\n", + " project_description \\\n", + "1886 Maintain/Repair Transportaiton Management Systems \n", + "1119 None \n", + "1589 Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant. \n", + "\n", + " total_project_cost_(millions) total_available_funds_(millions) \\\n", + "1886 0.20 0.00 \n", + "1119 47.00 0.00 \n", + "1589 2.00 5.00 \n", + "\n", + " unfunded_needs_(millions) city county location \\\n", + "1886 0.20 Visalia Tulare None \n", + "1119 47.00 None Sacramento None \n", + "1589 -3.00 None Santa Barbara None \n", + "\n", + " data_source \\\n", + "1886 Sb1 \n", + "1119 Local Options Sales Tax \n", + "1589 Local Options Sales Tax \n", + "\n", + " notes \\\n", + "1886 Iijaprogram: None, Dateupdated: 2022-09-19, Isonshs: None, Isonshscodes: N, Agencies: Caltrans, Popup: \n", + "1119 Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category. 
\n", + "1589 Notes: Nan \n", + "\n", + " funding_notes \\\n", + "1886 Sb1Funds: 0.0, Iijafunds: 0.0 \n", + "1119 Estimated Lost Funds: 0.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 0.0 \n", + "1589 Estimated Lost Funds: 2.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 3.0 \n", + "\n", + " ct_district \\\n", + "1886 06 \n", + "1119 None \n", + "1589 None \n", + "\n", + " project_description2 \\\n", + "1886 Maintain/Repair Transportaiton Management Systems \n", + "1119 Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S \n", + "1589 Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant. \n", + "\n", + " fully_funded enough_info \\\n", + "1886 No available funding info Yes \n", + "1119 No available funding info No \n", + "1589 Fully funded No \n", + "\n", + " counts \\\n", + "1886 number of strings in project desc: 5 % of null values:22 \n", + "1119 number of strings in project desc: 1 % of null values:40 \n", + "1589 number of strings in project desc: 24 % of null values:36 \n", + "\n", + " categories \n", + "1886 infrastructure \n", + "1119 transit \n", + "1589 street freeway infrastructure " + ] + }, + "execution_count": 67, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects_metric.drop(columns = ['geometry']).sample(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "ad99b589-1d78-4052-96ac-4617f0494544", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "infrastructure 1436\n", + " 1381\n", + "street infrastructure 739\n", + "street 372\n", + "bridge infrastructure 226\n", + "transit infrastructure 201\n", + "active transportation street infrastructure 106\n", + "transit 75\n", + "street infrastructure safety 58\n", + "transit street 
def apply_metrics(df):
    """
    Derive the applicable metrics of each project from its categories.

    The ``categories`` column (a string of category labels) is lower-cased
    and searched, by substring, for the trigger words of each metric:

    * safety               <- 'infrastructure', 'safety'
    * passenger_mode_shift <- 'active transportation',
                              'passenger mode shift', 'congestion relief'
    * infill_development   <- 'transit', 'active transportation'

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``categories`` column. Values that are not strings
        (None / NaN) are treated as "no categories".

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with one extra string column,
        ``applicable_metrics``: the matching metric names joined by a single
        space, in the order listed above ("" when none apply). ``df`` itself
        is not modified.
    """
    # (metric, trigger words) in output order. The category label is spelled
    # with spaces ("passenger mode shift"); the underscore spelling is kept
    # as well so inputs that already use it still match.
    metric_triggers = [
        ("safety", ('infrastructure', 'safety')),
        ("passenger_mode_shift", ('active transportation',
                                  'passenger mode shift',
                                  'passenger_mode_shift',
                                  "congestion relief")),
        ("infill_development", ('transit', 'active transportation')),
    ]

    def categorize_metrics(categories):
        """Return the space-separated metric names for one categories value."""
        if not isinstance(categories, str):
            return ""
        categories = categories.lower()
        # Only the metrics that matched are joined, so no blank runs appear.
        return " ".join(
            metric
            for metric, triggers in metric_triggers
            if any(word in categories for word in triggers)
        )

    df2 = df.copy()
    # Positional assignment: safe even if df has repeated index labels.
    df2['applicable_metrics'] = [
        categorize_metrics(categories) for categories in df2['categories']
    ]

    return df2
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
grant_programproject_description2categoriesapplicable_metrics
2587ShoppA $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.infrastructuresafety
1058LsrNone
17State Rail PlanExpansion Of The Smart Fleet To Accommodate Service Expansion.infrastructuresafety
845Imperial D 2008Overlayinfrastructuresafety
1933HmMaintain/Repair Pavement - Seal Coatstreet infrastructuresafety
2032ShoppA $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.infrastructuresafety
331SgrNone
3222ShoppA $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)bridge infrastructuresafety
106State Rail PlanDouble Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.transitinfill_development
1092LsrNone
1703StipNear The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198. Widen To 4 Divided Lanes And Realign Highway.street freeway infrastructuresafety
753Imperial D 2008Overlayinfrastructuresafety
43State Rail PlanCaltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.transit infrastructuresafety infill_development
627HmMaintain/Repair Transportaiton Management Systemsinfrastructuresafety
900Los Angeles Angeles M 2016Transportation System And Mobility Improve Program
2016ShoppA $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.street infrastructuresafety
1759HmRepair/Replace Culvertsinfrastructuresafety
2082ShoppA $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.street infrastructuresafety
1488ShoppA $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.transit infrastructuresafety infill_development
1886HmMaintain/Repair Transportaiton Management Systemsinfrastructuresafety
311Imperial D 2008Reconstructioninfrastructuresafety
1335ShoppA $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.bridge street infrastructuresafety
487HmMaintain/Repair Maintenance Stationtransit infrastructuresafety infill_development
471SgrNone
901Los Angeles Angeles M 2016Active Transportation 1St/Last Mile Connections Progactive transportationpassenger_mode_shift infill_development
456Imperial D 2008Lincoln Ave From Rose Ave To Weakley St S
1140LsrNone
3275ShoppA $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.street infrastructure safetysafety
3176ShoppA $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.active transportation streetpassenger_mode_shift infill_development
1540San Mateo W 2018Pedestrian Accessibility Improvements Citywideactive transportation infrastructuresafety passenger_mode_shift infill_development
1318San Diego A 20048F+2Hovpassenger mode shift
3135ShoppA $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).
320Imperial D 2008Overlayinfrastructuresafety
1633TcepIn San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System Needed To Operate The I-15 Express Lanes Project (08-0167M).street freeway infrastructuresafety
1376San Joaquin K 2003Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits
3036ShoppA $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.street infrastructuresafety
826Imperial D 2008Reconstructinfrastructuresafety
340Imperial D 2008Crack Seal/Slurry Coatstreet
2405ShoppA $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.street freeway infrastructure safetysafety
814Imperial D 2008Overlayinfrastructuresafety
44TircpNone
1041LsrNone
134State Rail PlanDesign And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton. This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.transit infrastructuresafety infill_development
2009ShoppA $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.bridge infrastructuresafety
12Alameda B 2000I-580 Interchange Improvements In Castro Valleyinfrastructuresafety
2107ShoppA $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.infrastructure safetysafety
1371ShoppA $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.bridge infrastructuresafety
973Los Angeles Angeles R 2008Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)transit infrastructuresafety infill_development
1823Tulare R 2006Over Crossinginfrastructuresafety
97State Rail PlanDouble Track Between Cp Canyon (Newhall Siding) And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.infrastructuresafety
\n", + "
" + ], + "text/plain": [ + " grant_program \\\n", + "2587 Shopp \n", + "1058 Lsr \n", + "17 State Rail Plan \n", + "845 Imperial D 2008 \n", + "1933 Hm \n", + "2032 Shopp \n", + "331 Sgr \n", + "3222 Shopp \n", + "106 State Rail Plan \n", + "1092 Lsr \n", + "1703 Stip \n", + "753 Imperial D 2008 \n", + "43 State Rail Plan \n", + "627 Hm \n", + "900 Los Angeles Angeles M 2016 \n", + "2016 Shopp \n", + "1759 Hm \n", + "2082 Shopp \n", + "1488 Shopp \n", + "1886 Hm \n", + "311 Imperial D 2008 \n", + "1335 Shopp \n", + "487 Hm \n", + "471 Sgr \n", + "901 Los Angeles Angeles M 2016 \n", + "456 Imperial D 2008 \n", + "1140 Lsr \n", + "3275 Shopp \n", + "3176 Shopp \n", + "1540 San Mateo W 2018 \n", + "1318 San Diego A 2004 \n", + "3135 Shopp \n", + "320 Imperial D 2008 \n", + "1633 Tcep \n", + "1376 San Joaquin K 2003 \n", + "3036 Shopp \n", + "826 Imperial D 2008 \n", + "340 Imperial D 2008 \n", + "2405 Shopp \n", + "814 Imperial D 2008 \n", + "44 Tircp \n", + "1041 Lsr \n", + "134 State Rail Plan \n", + "2009 Shopp \n", + "12 Alameda B 2000 \n", + "2107 Shopp \n", + "1371 Shopp \n", + "973 Los Angeles Angeles R 2008 \n", + "1823 Tulare R 2006 \n", + "97 State Rail Plan \n", + "\n", + " project_description2 \\\n", + "2587 A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip. \n", + "1058 None \n", + "17 Expansion Of The Smart Fleet To Accommodate Service Expansion. \n", + "845 Overlay \n", + "1933 Maintain/Repair Pavement - Seal Coat \n", + "2032 A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders. \n", + "331 None \n", + "3222 A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation) \n", + "106 Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. 
Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks. \n", + "1092 None \n", + "1703 Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198. Widen To 4 Divided Lanes And Realign Highway. \n", + "753 Overlay \n", + "43 Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph. \n", + "627 Maintain/Repair Transportaiton Management Systems \n", + "900 Transportation System And Mobility Improve Program \n", + "2016 A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail. \n", + "1759 Repair/Replace Culverts \n", + "2082 A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements. \n", + "1488 A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations. \n", + "1886 Maintain/Repair Transportaiton Management Systems \n", + "311 Reconstruction \n", + "1335 A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards. 
\n", + "487 Maintain/Repair Maintenance Station \n", + "471 None \n", + "901 Active Transportation 1St/Last Mile Connections Prog \n", + "456 Lincoln Ave From Rose Ave To Weakley St S \n", + "1140 None \n", + "3275 A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs. \n", + "3176 A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards. \n", + "1540 Pedestrian Accessibility Improvements Citywide \n", + "1318 8F+2Hov \n", + "3135 A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst). \n", + "320 Overlay \n", + "1633 In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System Needed To Operate The I-15 Express Lanes Project (08-0167M). \n", + "1376 Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits \n", + "3036 A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps. \n", + "826 Reconstruct \n", + "340 Crack Seal/Slurry Coat \n", + "2405 A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety. \n", + "814 Overlay \n", + "44 None \n", + "1041 None \n", + "134 Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton. This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood. 
\n", + "2009 A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge. \n", + "12 I-580 Interchange Improvements In Castro Valley \n", + "2107 A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements. \n", + "1371 A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge. \n", + "973 Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.) \n", + "1823 Over Crossing \n", + "97 Double Track Between Cp Canyon (Newhall Siding) And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita. \n", + "\n", + " categories \\\n", + "2587 infrastructure \n", + "1058 \n", + "17 infrastructure \n", + "845 infrastructure \n", + "1933 street infrastructure \n", + "2032 infrastructure \n", + "331 \n", + "3222 bridge infrastructure \n", + "106 transit \n", + "1092 \n", + "1703 street freeway infrastructure \n", + "753 infrastructure \n", + "43 transit infrastructure \n", + "627 infrastructure \n", + "900 \n", + "2016 street infrastructure \n", + "1759 infrastructure \n", + "2082 street infrastructure \n", + "1488 transit infrastructure \n", + "1886 infrastructure \n", + "311 infrastructure \n", + "1335 bridge street infrastructure \n", + "487 transit infrastructure \n", + "471 \n", + "901 active transportation \n", + "456 \n", + "1140 \n", + "3275 street infrastructure safety \n", + "3176 active transportation street \n", + "1540 active transportation infrastructure \n", + "1318 passenger mode shift \n", + "3135 \n", + "320 infrastructure \n", + "1633 street freeway infrastructure \n", + "1376 \n", + "3036 street infrastructure \n", + "826 infrastructure \n", + "340 street \n", + "2405 street freeway infrastructure 
safety \n", + "814 infrastructure \n", + "44 \n", + "1041 \n", + "134 transit infrastructure \n", + "2009 bridge infrastructure \n", + "12 infrastructure \n", + "2107 infrastructure safety \n", + "1371 bridge infrastructure \n", + "973 transit infrastructure \n", + "1823 infrastructure \n", + "97 infrastructure \n", + "\n", + " applicable_metrics \n", + "2587 safety \n", + "1058 \n", + "17 safety \n", + "845 safety \n", + "1933 safety \n", + "2032 safety \n", + "331 \n", + "3222 safety \n", + "106 infill_development \n", + "1092 \n", + "1703 safety \n", + "753 safety \n", + "43 safety infill_development \n", + "627 safety \n", + "900 \n", + "2016 safety \n", + "1759 safety \n", + "2082 safety \n", + "1488 safety infill_development \n", + "1886 safety \n", + "311 safety \n", + "1335 safety \n", + "487 safety infill_development \n", + "471 \n", + "901 passenger_mode_shift infill_development \n", + "456 \n", + "1140 \n", + "3275 safety \n", + "3176 passenger_mode_shift infill_development \n", + "1540 safety passenger_mode_shift infill_development \n", + "1318 \n", + "3135 \n", + "320 safety \n", + "1633 safety \n", + "1376 \n", + "3036 safety \n", + "826 safety \n", + "340 \n", + "2405 safety \n", + "814 safety \n", + "44 \n", + "1041 \n", + "134 safety infill_development \n", + "2009 safety \n", + "12 safety \n", + "2107 safety \n", + "1371 safety \n", + "973 safety infill_development \n", + "1823 safety \n", + "97 safety " + ] + }, + "execution_count": 90, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "all_projects_metric[['grant_program','project_description2','categories','applicable_metrics']].sample(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "9ea21916-dd50-4396-850b-87ea2535c9f4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "7" + ] + }, + "execution_count": 86, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"all_projects_metric.applicable_metrics.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "1ad5b6ae-9407-46ae-b2ff-c9ad6cbea83c", + "metadata": {}, + "source": [ + "### Categorization" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "f9a86d28-9b77-48c3-ba6d-27dc360f2fd0", + "metadata": {}, + "outputs": [], + "source": [ + "def get_list_of_words(df, col: str) -> list:\n", + " \"\"\"\n", + " Natalie's function to clean and place words in a project description column\n", + " into a list\n", + " \"\"\"\n", + " # get just the one col\n", + " column = df[[col]]\n", + "\n", + " # remove single-dimensional entries from the shape of an array\n", + " col_text = column.squeeze()\n", + " # get list of words\n", + " text_list = col_text.tolist()\n", + "\n", + " # Join all the column into one large text blob, lower text\n", + " text_list = \" \".join(text_list).lower()\n", + "\n", + " # remove punctuation\n", + " text_list = re.sub(r\"[^\\w\\s]\", \"\", text_list)\n", + "\n", + " # List of stopwords\n", + " swords = [re.sub(r\"[^A-z\\s]\", \"\", sword) for sword in stopwords.words(\"english\")]\n", + "\n", + " # Remove stopwords\n", + " clean_text_list = [\n", + " word for word in word_tokenize(text_list.lower()) if word not in swords\n", + " ]\n", + "\n", + " return clean_text_list" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "cd602787-2444-49c5-8bb8-c59a63975de5", + "metadata": {}, + "outputs": [], + "source": [ + "def find_common_phrases(df, description_column: str, values_to_add: list):\n", + "\n", + " # Break apart every word in the description column into a list\n", + " descriptions_list = get_list_of_words(df, description_column)\n", + "\n", + " # Get phrases of whatever length you want (2,3,4,etc)\n", + " c = Counter([\" \".join(y) for x in [2] for y in ngrams(descriptions_list, x)])\n", + "\n", + " # Make a dataframe out of the counter values\n", + " df_phrases = pd.DataFrame({\"phrases\": list(c.keys()), \"total\": 
list(c.values())})\n", + "\n", + " # Take phrases that are repeated more than 40 times and turn it into a list\n", + " df_phrases = ((df_phrases.loc[df_phrases[\"total\"] > 40])).reset_index(drop=True)\n", + " common_phrases_list = df_phrases.phrases.tolist()\n", + "\n", + " phrases_to_del = [\n", + " \"san bernardino\",\n", + " \"los angeles\",\n", + " \"contra costa\",\n", + " \"el dorado\",\n", + " \"san luis obispo\",\n", + " \"luis obispo\",\n", + " \"del norte\",\n", + " \"san francisco\",\n", + " \"improve approximately\",\n", + " ]\n", + "\n", + " common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))\n", + "\n", + " # CLean up the list to delete county information/etc\n", + " words_to_delete = [\n", + " \"county\",\n", + " \"route\",\n", + " \"dollar\",\n", + " \"mile\",\n", + " \"santa\",\n", + " \"project\",\n", + " \"san\",\n", + " \"lanes\",\n", + " \"lane\",\n", + " \"2\",\n", + " \"4\",\n", + " \"financial\",\n", + " \"prop\",\n", + " \"best\",\n", + " \"approximately\",\n", + " ]\n", + "\n", + " for word in words_to_delete:\n", + " common_phrases_list = [x for x in common_phrases_list if word not in x]\n", + "\n", + " # ADD certain keywords here\n", + " # Operating Additional Service\n", + " common_phrases_list.extend(values_to_add)\n", + "\n", + " return common_phrases_list" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "ec139873-4bb7-4428-9fd7-ceb9e247d4a3", + "metadata": {}, + "outputs": [], + "source": [ + "def categorize_projects(\n", + " df,\n", + " description_column: str,\n", + " project_id_column: str,\n", + " title_column: str,\n", + " values_to_add: list,\n", + "):\n", + "\n", + " # Find most common 2 word phrases for some automatic project categories\n", + " common_phrases_list = find_common_phrases(df, description_column, values_to_add)\n", + "\n", + " # Place all the words in common_phrases_list into a blob named query\n", + " # 
https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa\n", + " query = \"|\".join(common_phrases_list)\n", + "\n", + " # Remove punctation and lower strings in original description column befores searching\n", + " df[\"clean_description\"] = (\n", + " df[description_column]\n", + " .str.lower()\n", + " .str.replace(\"-\", \" \", regex=True)\n", + " .str.replace(\"(\", \" \", regex=True)\n", + " .str.replace(\")\", \" \", regex=True)\n", + " .str.replace(\".\", \" \", regex=True)\n", + " .str.strip()\n", + " )\n", + "\n", + " # Search through description column for the most common phrases\n", + " # Input the results in the new column\n", + " df[\"auto_project_category\"] = df[\"clean_description\"].str.findall(\n", + " r\"\\b({})\\b\".format(query)\n", + " )\n", + "\n", + " # Explode to take categories out of a list\n", + " # Drop duplicate project keywords by title\n", + " df = (\n", + " df.explode(\"auto_project_category\")\n", + " .sort_values([project_id_column, title_column])\n", + " .drop_duplicates(\n", + " subset=[\n", + " description_column,\n", + " project_id_column,\n", + " title_column,\n", + " \"auto_project_category\",\n", + " ]\n", + " )\n", + " )\n", + "\n", + " # Fill any uncategorized projects as \"Other\"\n", + " df[\"auto_project_category\"] = (\n", + " df[\"auto_project_category\"].fillna(\"Other\").str.title()\n", + " )\n", + "\n", + " # Correct spelling\n", + " spell = Speller(lang=\"en\")\n", + " df[\"auto_project_category\"] = df[\"auto_project_category\"].apply(\n", + " lambda x: \" \".join([spell(i) for i in x.split()])\n", + " )\n", + "\n", + " # Summarize - put all the categories onto one line\n", + " df = (\n", + " df.groupby(\n", + " [\n", + " description_column,\n", + " project_id_column,\n", + " title_column,\n", + " ]\n", + " )[\"auto_project_category\"]\n", + " .apply(\",\".join)\n", + " .reset_index()\n", + " )\n", + "\n", + " return df" + ] + }, + { + 
"cell_type": "code", + "execution_count": 46, + "id": "d123f3b9-da23-4d4d-a2e2-dc3769100171", + "metadata": {}, + "outputs": [], + "source": [ + "def add_all_projects2():\n", + "\n", + " # Load dataframes\n", + " state_rail_plan = harmonize_srp()\n", + " lost = harominze_lost()\n", + " sb1 = harmonize_sb1()\n", + "\n", + " # Concat for df\n", + " all_projects_df = pd.concat([lost, state_rail_plan, sb1])\n", + "\n", + " # Categorize\n", + " categories = categorize_projects(\n", + " all_projects_df,\n", + " \"project_description\",\n", + " \"project_title\",\n", + " \"project_id\",\n", + " [\n", + " \"operating\",\n", + " \"service\",\n", + " \"zero emission vehicle\",\n", + " \"zev\",\n", + " \"maintain/repair\",\n", + " \"repair/replace\",\n", + " ],\n", + " )\n", + "\n", + " # Merge categorized\n", + " all_projects_df = pd.merge(\n", + " all_projects_df.drop(columns=[\"clean_description\"]),\n", + " categories,\n", + " how=\"left\",\n", + " on=[\"project_description\", \"project_title\", \"project_id\"],\n", + " )\n", + "\n", + " # Rename\n", + " all_projects_df = all_projects_df.drop(columns=[\"auto_project_category_x\"]).rename(\n", + " columns={\"auto_project_category_y\": \"auto_tagged_project_categories\"}\n", + " )\n", + " # Concat for gdf\n", + " all_projects_gdf = pd.concat([sb1])\n", + " all_projects_gdf = all_projects_gdf.set_geometry(\"location\")\n", + "\n", + " return all_projects_df, all_projects_gdf" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "87a29e05-0ba6-40cb-93e2-d097159e6235", + "metadata": {}, + "outputs": [], + "source": [ + "# all_projects, all_projects_geo = add_all_projects()" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "id": "873a88ca-5a47-4bfe-a1d3-715a5bed05bb", + "metadata": { + "scrolled": true, + "tags": [] + }, + "outputs": [], + "source": [ + "# all_projects.drop(columns = ['location'])[['project_title','project_category', 
'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)" + ] + }, + { + "cell_type": "markdown", + "id": "85cfedf8-14aa-4d6c-b30e-cc9f6ee5bbf8", + "metadata": {}, + "source": [ + "### Look at the data" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "id": "a7e39b78-af8b-4bc5-8911-572839a72b36", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "ename": "KeyError", + "evalue": "\"Column(s) ['project_id'] do not exist\"", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mall_projects\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlead_agency\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mproject_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnunique\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39msort_values(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, ascending\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 3\u001b[0m )\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m10\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/groupby/generic.py:895\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, 
**kwargs)\u001b[0m\n\u001b[1;32m 892\u001b[0m func \u001b[38;5;241m=\u001b[39m maybe_mangle_lambdas(func)\n\u001b[1;32m 894\u001b[0m op \u001b[38;5;241m=\u001b[39m GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args, kwargs)\n\u001b[0;32m--> 895\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", + "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:172\u001b[0m, in \u001b[0;36mApply.agg\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_str()\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(arg):\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(arg):\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magg_list_like()\n", + "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:496\u001b[0m, in \u001b[0;36mApply.agg_dict_like\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 493\u001b[0m selected_obj \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selected_obj\n\u001b[1;32m 494\u001b[0m selection \u001b[38;5;241m=\u001b[39m 
obj\u001b[38;5;241m.\u001b[39m_selection\n\u001b[0;32m--> 496\u001b[0m arg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43magg\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m selected_obj\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 499\u001b[0m \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n\u001b[1;32m 500\u001b[0m colg \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_gotitem(selection, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", + "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:619\u001b[0m, in \u001b[0;36mApply.normalize_dictlike_arg\u001b[0;34m(self, how, obj, func)\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) \u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 618\u001b[0m cols_sorted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(safe_sort(\u001b[38;5;28mlist\u001b[39m(cols)))\n\u001b[0;32m--> 619\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcols_sorted\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m do not exist\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 621\u001b[0m aggregator_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m 623\u001b[0m \u001b[38;5;66;03m# if we have a dict of any 
non-scalars\u001b[39;00m\n\u001b[1;32m 624\u001b[0m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n", + "\u001b[0;31mKeyError\u001b[0m: \"Column(s) ['project_id'] do not exist\"" + ] + } + ], + "source": [ + "all_projects.groupby([\"lead_agency\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", + " \"project_id\", ascending=False\n", + ").head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "daa0b1d3-4416-4537-b568-bdaae9fd1fdb", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects[\n", + " (all_projects.county == \"Kern\")\n", + " & (all_projects.project_description.str.contains(\"Seal Coat\"))\n", + "].drop(columns=[\"location\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc906308-31d4-4fde-b492-8218b05cec90", + "metadata": {}, + "outputs": [], + "source": [ + "# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d2d6ac3a-c517-4df2-b907-0bac0a09e34a", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"auto_tagged_project_categories\"]).agg(\n", + " {\"project_id\": \"nunique\"}\n", + ").sort_values(\"project_id\", ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5150da00-2a30-4f4d-bec8-1d9e5c66d623", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"project_category\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", + " \"project_id\", ascending=False\n", + ").head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1bf38631-a734-47b0-9465-fcfb8ebafcad", + "metadata": {}, + "outputs": [], + "source": [ + 
"all_projects.groupby([\"project_description\"]).agg(\n", + " {\"project_id\": \"nunique\"}\n", + ").sort_values(\"project_id\", ascending=False).head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5c1baa16-e15c-48e7-9772-ef67755f9d21", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"county\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", + " \"project_id\", ascending=False\n", + ").head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d55e4ed-9b69-4111-b2ed-69715c9d90c5", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.lead_agency.nunique()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "01a534d9-75e4-4ff8-aa11-99db480de733", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.total_project_cost.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6985e5d0-cf27-423f-8775-16eb3c518beb", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "all_projects.loc[all_projects.fully_funded == \"Fully funded\"].groupby(\n", + " [\"data_source\"]\n", + ").agg({\"project_id\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3259fc95-2db6-46ad-8cc6-a0357aa19077", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.loc[all_projects.fully_funded == \"Partially funded\"].groupby(\n", + " [\"data_source\"]\n", + ").agg({\"project_id\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2ef08825-9e29-4268-9172-d0d83e08243b", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"data_source\"]).agg({\"project_id\": \"nunique\"})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5fae701e-4132-4d06-8c27-3e598e072172", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"fully_funded\"]).agg(\n", + " {\"project_id\": \"nunique\"}\n", + 
").reset_index().sort_values(\"project_id\", ascending=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "171611d6-acf9-46d8-9814-20534114d43e", + "metadata": {}, + "outputs": [], + "source": [ + "all_projects.groupby([\"data_source\", \"fully_funded\"]).agg({\"project_id\": \"nunique\"})" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/project_list/compile_all_projects.ipynb b/project_list/compile_all_projects.ipynb index 1863ec094..7a7079c4c 100644 --- a/project_list/compile_all_projects.ipynb +++ b/project_list/compile_all_projects.ipynb @@ -2,117 +2,40 @@ "cells": [ { "cell_type": "markdown", - "id": "77106c12-82aa-4be4-8d9c-e66fafec4d67", - "metadata": { - "tags": [] - }, + "id": "a47ae6dd-278c-42d0-a708-f72088e55f51", + "metadata": {}, "source": [ - "## General function to clean up data from various grants\n", - "To-Do\n", - "* De duplicate projects\n", - "* Rearrange counties in County column in alphabetical order.\n", - "* Millions to thousands -> seems easier to read.\n", - "* Differentiate btwn project START year and END year.\n", - "* Add Post Mile column\n", - "\n", - "Done\n", - "* Switch City of Berkeley to Berkeley City. https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n", - "\n", - "Strategy/Questions:\n", - "* Make sure one row=one project. How? \n", - "* What should be the unit of project cost?\n", - "* Break up Caltrans by district or leave as is? 
\n", - "\n", - "Columns/Data Dictionary\n", - "* project_title (str): N/A.\n", - "* lead_agency (str): the entity leading the project or receiving the grant.\n", - "* project_year (TBD): when the project will begin.\n", - "* project_category (str): the category/categories a project belongs to.\n", - "* grant_program (str): the fund a project is receiving funds for. This does not preclude the fact that a project can receive funds from mulitple programs. \n", - "* phase (str): the latest phase the project is in.\n", - "* project_description (str): N/A.\n", - "* total_project_cost_(millions): N/A.\n", - "* total_available_funds_(millions): all the funds available to the project.\n", - "* unfunded_needs_(millions): subtract total_project_cost_(millions) by total_available_funds_(millionis).\n", - "* city (str): the city a project is located in.\n", - "* county (str): the county a project is lcoated in.\n", - "* location (str): an address or more detailed information regarding where the project will take place.\n", - "* geometry: geospatial information.\n", - "* data_source (str): N/A.\n", - "* notes (str): additional information regarding the project.\n", - "* funding_notes (str): additional funding information regarding the project.\n", - "* ct_district (int): the Caltrans district a project is located in.\n", - "* fully_funded (str): comparing total_available_funds_(millions) and total_project_cost_(millions) to figure out whether a project is fully, partially, or not funded.\n", - "* enough_info (str): counting the # of null values and # of strings in the project description to determine whether or not a project has enough information." + "## Compile Projects\n", + "To-do\n", + "* Figure out how to version things b/c projects will get updated and we want to track any changes.\n", + "* This only needs to be done with data from lp2000 and ctips.\n", + "* Need to track changes across all the different dataframes\n", + "* Use merges to figure it out?" 
] }, { "cell_type": "code", "execution_count": 1, - "id": "06ac19fe-7b6c-4560-9740-8a4f72c5b6e1", + "id": "bd086d78-ffe5-4cf3-9f70-f5b7f3a5cf40", "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.9/site-packages/geopandas/_compat.py:123: UserWarning: The Shapely GEOS version (3.11.1-CAPI-1.17.1) is incompatible with the GEOS version PyGEOS was compiled with (3.10.1-CAPI-1.16.0). Conversions between both will be slow.\n", - " warnings.warn(\n", - "/home/jovyan/data-analyses/project_list/_sb1_utils.py:1: UserWarning: Shapely 2.0 is installed, but because PyGEOS is also installed, GeoPandas will still use PyGEOS by default for now. To force to use and test Shapely 2.0, you have to set the environment variable USE_PYGEOS=0. You can do this before starting the Python process, or in your code before importing geopandas:\n", - "\n", - "import os\n", - "os.environ['USE_PYGEOS'] = '0'\n", - "import geopandas\n", - "\n", - "In a future release, GeoPandas will switch to using Shapely by default. 
If you are using PyGEOS directly (calling PyGEOS functions on geometries from GeoPandas), this will then stop working and you are encouraged to migrate from PyGEOS to Shapely 2.0 (https://shapely.readthedocs.io/en/latest/migration_pygeos.html).\n", - " import geopandas as gpd\n" - ] - } - ], + "outputs": [], "source": [ - "import _cleaning_utils\n", - "import _harmonization_utils as harmonization_utils\n", + "import _harmonization_utils as har_utils\n", + "import _lrtp_utils as lrtp_utils\n", + "import _sb1_utils as sb1_utils\n", + "import _specific_list_utils\n", "import _state_rail_plan_utils as srp_utils\n", - "# import geopandas as gpd\n", + "import geopandas as gpd\n", "import pandas as pd\n", - "# import shapely\n", - "from calitp_data_analysis.sql import to_snakecase" + "from calitp_data_analysis.sql import to_snakecase\n", + "import hashlib\n", + "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": 2, - "id": "d78be4e7-2349-4ffd-9d59-f9fa450ae7dd", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "'\\nimport re\\nimport nltk\\nfrom nltk import ngrams\\nfrom nltk.corpus import stopwords\\nfrom nltk.tokenize import sent_tokenize, word_tokenize\\nimport re\\nfrom collections import Counter\\nfrom autocorrect import Speller\\n'" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "\"\"\"\n", - "import re\n", - "import nltk\n", - "from nltk import ngrams\n", - "from nltk.corpus import stopwords\n", - "from nltk.tokenize import sent_tokenize, word_tokenize\n", - "import re\n", - "from collections import Counter\n", - "from autocorrect import Speller\n", - "\"\"\"" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "e7b68eeb-422d-4be8-b557-7bd9e95599af", + "id": "7a1e769e-8af3-4bb8-87c9-e3b1c64c644b", "metadata": {}, "outputs": [], "source": [ @@ -124,629 +47,570 @@ }, { "cell_type": "code", - "execution_count": 4, - "id": 
"0541b671-a020-485f-9b0a-f46238f1d4f9", + "execution_count": 3, + "id": "9f74e0a0-c1e5-4e42-8d06-11e0ef0464ab", "metadata": {}, "outputs": [], "source": [ - "# lost = harmonization_utils.load_lost()" + "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/project_list/\"" ] }, { - "cell_type": "code", - "execution_count": 5, - "id": "db720477-44f5-4cbd-80ac-a0fe86e47cf9", + "cell_type": "markdown", + "id": "9dc3a31d-a797-4ea0-ad80-a4adb5b4c740", "metadata": {}, - "outputs": [], "source": [ - "def create_notes(df, note_cols: list, new_col_name: str):\n", - " \"\"\"\n", - " Concat multiple columns into one.\n", - " \"\"\"\n", - " prefix = \"_\"\n", - " for column in note_cols:\n", - " df[f\"{prefix}{column}\"] = df[column].astype(str)\n", - " note_cols = [prefix + sub for sub in note_cols]\n", - "\n", - " # https://stackoverflow.com/questions/65532480/how-to-combine-column-names-and-values\n", - " def combine_notes(x):\n", - " return \", \".join([col + \": \" + x[col] for col in note_cols])\n", - "\n", - " df[new_col_name] = df.apply(combine_notes, axis=1)\n", - " df[new_col_name] = df[new_col_name].str.replace(\"_\", \" \")\n", - "\n", - " return df" + "### LRTP/LOST" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "78b5d13c-f4ba-4baf-8c3a-f520a960a44a", + "execution_count": 19, + "id": "67204871-470c-41ec-bf61-6b82b90c88e4", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "96 rows are headers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:720: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + " sandag.cost2020m.str.replace(\"$\", \"\")\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "65 rows are headers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "360 rows are headers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "68 rows are headers\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", + "/home/jovyan/data-analyses/project_list/_lrtp_utils.py:66: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" + ] + } + ], "source": [ - "# srp = harmonization_utils.load_state_rail_plan()" + "lrtp_lost_df, lrtp_lost_gdf = lrtp_utils.all_mpo(True)" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "f3829bd6-8fc1-4c15-809f-75020248a722", + "execution_count": 21, + "id": "052c4887-4ca7-4ae9-bb15-cdbc26544298", "metadata": {}, "outputs": [], "source": [ - "columns_to_keep = [\n", - " \"project_title\",\n", - " \"lead_agency\",\n", - " \"project_year\",\n", - " \"project_category\",\n", - " \"project_start_year\",\n", - " \"project_completion_year\",\n", - " \"grant_program\",\n", - " \"phase\",\n", - " \"project_description\",\n", - " \"total_project_cost_(millions)\",\n", - " \"total_available_funds_(millions)\",\n", - " \"unfunded_needs_(millions)\",\n", - " \"city\",\n", - " \"county\",\n", - " \"location\",\n", - " \"post_mile\",\n", - " \"geometry\",\n", - " \"data_source\",\n", - " \"notes\",\n", - " \"funding_notes\",\n", - " \"ct_district\",\n", - " \"project_description2\",\n", - "]" + "def unique_project_number(df:pd.DataFrame) -> pd.DataFrame:\n", + " df['timestamp'] = datetime.now().strftime('%Y%m%d%H%M%S')\n", + " df['combo'] = df.apply(lambda row: f\"{row.name}{row['timestamp']}{''.join(map(str, row))}\", axis=1)\n", + " df['project_number'] = 
df['combo'].apply(lambda x: hashlib.sha256(x.encode('utf-8')).hexdigest()[:12])\n", + " df = df.drop(columns = ['combo', 'timestamp'])\n", + " return df" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "291e821d-9b3f-40a2-bde9-7a12b31eb410", + "execution_count": 22, + "id": "4fb60368-eb54-4c9e-a422-c42f45ad74b0", "metadata": {}, "outputs": [], "source": [ - "def harmonizing(\n", - " df,\n", - " agency_name_col: str,\n", - " project_name_col: str,\n", - " project_description_col: str,\n", - " project_category_col: str,\n", - " phase_col: str,\n", - " project_cost_col: str,\n", - " location_col: str,\n", - " geography_col: str,\n", - " post_mile_col:str,\n", - " county_col: str,\n", - " city_col: str,\n", - " district_col:str, \n", - " project_start_year_col: str,\n", - " project_completion_year_col:str,\n", - " program_col: str,\n", - " data_source: str,\n", - " fund_cols: list,\n", - " notes_cols: list,\n", - " cost_in_millions: bool = True,\n", - "):\n", - " \"\"\"\n", - " Take a dataset and change the column names/types to\n", - " default names and formats.\n", - " \"\"\"\n", - " rename_columns = {\n", - " agency_name_col: \"lead_agency\",\n", - " project_name_col: \"project_title\",\n", - " project_description_col: \"project_description\",\n", - " project_category_col: \"project_category\",\n", - " project_cost_col: \"total_project_cost_(millions)\",\n", - " location_col: \"location\",\n", - " geography_col: \"geometry\",\n", - " phase_col: \"phase\",\n", - " post_mile_col: \"post_mile\",\n", - " county_col: \"county\",\n", - " city_col: \"city\",\n", - " district_col: \"ct_district\",\n", - " project_start_year_col: \"project_start_year\",\n", - " project_end_year_col: \"project_completion_year\",\n", - " program_col: \"grant_program\",\n", - " }\n", - " # Rename columns\n", - " df = df.rename(columns=rename_columns)\n", - " \n", - " # Clean up monetary columns to be interger\n", - " cost_columns = 
df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n", - " for i in cost_columns:\n", - " df[i] = df[i].apply(pd.to_numeric, errors=\"coerce\").fillna(0)\n", - " \n", - " # Divide cost columns by millions\n", - " # If bool is set to True\n", - " if cost_in_millions:\n", - " for i in fund_cols + [\"total_project_cost_(millions)\"]:\n", - " df[i] = df[i].divide(1_000_000)\n", - "\n", - " # Add new column with funding breakout\n", - " # Since it's summarized above and the details are suppressed.\n", - " df[\"total_available_funds_(millions)\"] = df[fund_cols].sum(axis=1)\n", - " df = create_notes(df, fund_cols, \"funding_notes\")\n", - " \n", - " # Add column for unfunded needs\n", - " df[\"unfunded_needs_(millions)\"] = df[\"total_project_cost_(millions)\"] - df[\"total_available_funds_(millions)\"]\n", - " \n", - " # Add program\n", - " df[\"data_source\"] = data_source\n", - " \n", - " # Create columns even if they don't exist, just to harmonize\n", - " # before concatting.\n", - " create_columns = [\n", - " \"county\",\n", - " \"city\",\n", - " \"notes\",\n", - " \"project_start_year\",\n", - " \"project_completion_year\",\n", - " \"post_mile\",\n", - " \"project_category\",\n", - " \"location\",\n", - " \"phase\",\n", - " \"ct_district\"\n", - " ]\n", - " for column in create_columns:\n", - " if column not in df:\n", - " df[column] = \"None\"\n", - " if \"geometry\" not in df:\n", - " df[\"geometry\"] = None\n", - " if \"grant_program\" not in df:\n", - " df[\"grant_program\"] = data_source\n", - " \n", - " # Create notes - aka other columns that were supressed\n", - " df = create_notes(df, notes_cols, \"notes\")\n", - " \n", - " # Clean up string columns\n", - " string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n", - " for i in string_cols:\n", - " df[i] = df[i].str.replace(\"_\", \" \").str.strip().str.title()\n", - "\n", - " # Fill in any nulls\n", - " df['project_description2'] = 
df.project_description.fillna(df.project_title)\n", - " df = df.fillna(df.dtypes.replace({\"float64\": 0.0, \"object\": \"None\"}))\n", - "\n", - " # Only keep certain columns\n", - " df = df[columns_to_keep]\n", - " return df" + "lrtp_lost_df = unique_project_number(lrtp_lost_df)" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "0ea5badb-841b-4941-b48f-23d750b5ed27", + "execution_count": 23, + "id": "846f7621-4999-4a7d-b2f7-ec0a18b19ac5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "16276" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def harmonize_srp():\n", - " df = harmonization_utils.load_state_rail_plan()\n", - " df = harmonizing(\n", - " df,\n", - " agency_name_col=\"lead_agency\",\n", - " project_name_col=\"project_name\",\n", - " project_description_col=\"project_description\",\n", - " project_category_col=\"project_category\",\n", - " phase_col=\"\",\n", - " project_cost_col=\"total_project_cost\",\n", - " location_col=\"corridor\",\n", - " geography_col=\"\",\n", - " county_col=\"\",\n", - " city_col=\"\",\n", - " district_col=\"\",\n", - " project_year_col=\"\",\n", - " program_col=\"\",\n", - " data_source=\"State Rail Plan\",\n", - " fund_cols=[],\n", - " notes_cols = ['project_time_horizon','srp_region', \n", - " 'sub_corridor_node_1', 'sub_corridor_node_2', 'itsp_corridor'],\n", - " cost_in_millions=True,\n", - " )\n", - "\n", - " return df" + "lrtp_lost_df.project_number.nunique()" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "2b60a4e7-cc69-41fb-9285-c32f9fa0791e", + "execution_count": 55, + "id": "d5cd997b-dc72-4f04-9950-158fe13c25fb", "metadata": {}, "outputs": [], "source": [ - "# srp_harmonized = harmonize_srp()" + "def separate_out_df(df:pd.DataFrame, columns_to_keep: list)-> pd.DataFrame:\n", + " \"\"\"\n", + " Subset the column, drop any rows \n", + " in which the values are Nan or \"None.\"\n", + " \"\"\"\n", + 
" # Subset\n", + " df2 = df[columns_to_keep]\n", + " \n", + " # Fill in missing values\n", + " try:\n", + " df2 = df2.fillna('none')\n", + " except:\n", + " df2\n", + " \n", + " # Remove project_number and keep only cols of interest\n", + " columns_to_keep.remove('project_number')\n", + " \n", + " # Drop rows that are nan or \"None\" based on how many columns are listed\n", + " if len(columns_to_keep) == 1:\n", + " df2 = df2.dropna(how = \"any\")\n", + " df2 = df2[df2.applymap(lambda x: x.lower() if isinstance(x, str) else x) != 'none'].dropna()\n", + " \n", + " # If there are more than one column to separate out,\n", + " # keep any row that has a non-null value \n", + " else:\n", + " df2 = df2.dropna(how = \"all\", subset = columns_to_keep)\n", + " return df2" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "2100f3b7-55c8-45ad-b3d1-99a0319c7ac8", + "execution_count": 25, + "id": "8acf0ccd-3b35-41db-9320-3352b9b4e813", "metadata": {}, "outputs": [], "source": [ - "# srp_harmonized.tail()" + "lrtp_lost_county = separate_out_df(lrtp_lost_df, ['project_number', 'county'])" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "3142aacb-d5f4-4bc1-8cc8-99f50c45b301", + "execution_count": 26, + "id": "c75b08c0-4da0-44d0-b675-dc848240b994", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(16276, 11)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# srp_og = harmonization_utils.load_state_rail_plan()" + "lrtp_lost_df.shape" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "0ae0b8bd-3e5b-4119-8fee-d496689f9c7c", + "execution_count": 27, + "id": "fce5ffc2-1b46-4f8d-bbb1-2a9251d35a8c", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(4012, 2)" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# srp_og.sample()" + "lrtp_lost_county.shape" ] }, { 
"cell_type": "code", - "execution_count": 14, - "id": "9307a340-c699-4d93-ba30-abe04563dd8d", + "execution_count": 28, + "id": "71ff78d9-cafc-4bc4-bc68-2f0e001a5acd", "metadata": {}, "outputs": [], "source": [ - "# srp_og.columns" + "lrtp_lost_city = separate_out_df(lrtp_lost_df, ['project_number', 'city'])" ] }, { "cell_type": "code", - "execution_count": 15, - "id": "23804222-466a-4754-a1ad-fd8f3f8a5239", + "execution_count": 29, + "id": "4116f424-755e-4382-98c7-c3f0c65c5514", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(745, 2)" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def harmonize_lost():\n", - " df = harmonization_utils.load_lost()\n", - " df = harmonizing(\n", - " df,\n", - " agency_name_col=\"agency\",\n", - " project_name_col=\"project_title\",\n", - " project_description_col=\"project_description\",\n", - " project_category_col=\"project_category\",\n", - " project_cost_col=\"cost__in_millions_\",\n", - " phase_col=\"\",\n", - " location_col=\"location\",\n", - " geography_col=\"\",\n", - " county_col=\"county\",\n", - " city_col=\"city\",\n", - " district_col = \"\",\n", - " project_year_col=\"\",\n", - " program_col=\"measure\",\n", - " data_source=\"Local Options Sales Tax\",\n", - " fund_cols=[\n", - " \"estimated_lost_funds\",\n", - " \"estimated_federal_funds\",\n", - " \"estimated_state_funds\",\n", - " \"estimated_local_funds\",\n", - " \"estimated_other_funds\",\n", - " ],\n", - " notes_cols = [\"notes\"],\n", - " cost_in_millions=False,\n", - " )\n", - "\n", - " return df" + "lrtp_lost_city.shape" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "e13f87d5-514f-404f-8cc8-4dbf877754da", + "execution_count": 30, + "id": "829f6817-f0f7-4193-9ce8-f9732baff8d8", "metadata": {}, "outputs": [], "source": [ - "# lost_og = harmonization_utils.load_lost()" + "lrtp_lost_agency = separate_out_df(lrtp_lost_df, ['project_number', 
'lead_agency'])" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "60d66f94-7915-43b7-990e-896600e20d40", + "execution_count": 31, + "id": "78df708c-82cf-44ae-99f4-dd84e7c40dc1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(16276, 2)" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# lost_og.columns" + "lrtp_lost_agency.shape" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "20e8a81a-e6b1-4bdf-a0f8-21420c62b68a", + "execution_count": 32, + "id": "dd61d79c-9999-4c11-9353-47ac4f16b8b1", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_numberlead_agency
618d1631fce5bcSlocog
\n", + "
" + ], + "text/plain": [ + " project_number lead_agency\n", + "61 8d1631fce5bc Slocog" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def harmonize_sb1():\n", - " df = harmonization_utils.load_sb1()\n", - " df = harmonizing(\n", - " df,\n", - " agency_name_col=\"implementingagency\",\n", - " project_name_col=\"projecttitle_x\",\n", - " project_description_col=\"projectdescription\",\n", - " project_category_col=\"\",\n", - " phase_col=\"projectstatuses\",\n", - " project_cost_col=\"totalcost\",\n", - " location_col=\"\",\n", - " geography_col=\"geometry\",\n", - " county_col=\"countynames\",\n", - " city_col=\"citynames\",\n", - " district_col = \"ct_districts\",\n", - " project_year_col=\"fiscalyears\",\n", - " program_col=\"programcodes\",\n", - " data_source=\"SB1\",\n", - " fund_cols=[\"sb1funds\", \"iijafunds\"],\n", - " notes_cols = ['iijaprogram','dateupdated','isonshs', 'isonshscodes','agencies', 'popup'],\n", - " cost_in_millions=True,\n", - " )\n", - "\n", - " return df" + "lrtp_lost_agency.sample()" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "4d39d086-ef36-4f21-ab44-17980304be74", + "execution_count": 33, + "id": "5c3dfa8a-480a-4062-b59c-0ce5dff75cdc", "metadata": {}, "outputs": [], "source": [ - "# sb1_og = harmonization_utils.load_sb1()" + "lrtp_lost_geo = separate_out_df(lrtp_lost_df, ['project_number', 'geometry'])" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "9c65170e-17ef-42da-b161-358e40f815a8", + "execution_count": 34, + "id": "40998366-218c-4c9b-b890-dc97769e3893", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "((1357, 2), (1355, 10))" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# sb1_og.columns" + "lrtp_lost_geo.shape, lrtp_lost_gdf.shape" ] }, { "cell_type": "code", - "execution_count": 21, - "id": "5ade46ae-4768-4855-b0ea-9ff4ec7607af", + 
"execution_count": 35, + "id": "92fe5891-014e-4177-beb6-1e83c957d0fb", "metadata": {}, "outputs": [], "source": [ - "# sb1_og.drop(columns = ['geometry']).sample(3)" + "lrtp_to_drop = ['county', 'city', 'lead_agency', 'geometry']" ] }, { "cell_type": "code", - "execution_count": 22, - "id": "5cab53c4-9c63-4bd4-b837-f43e62900e8d", + "execution_count": 36, + "id": "29ca0b11-6244-455b-96b2-dd53144d7c0e", "metadata": {}, "outputs": [], "source": [ - "# harmonized_sb1 = harmonize_sb1()" - ] - }, - { - "cell_type": "markdown", - "id": "6c14df71-56af-43a1-b0e0-2d02ef38e18e", - "metadata": {}, - "source": [ - "### Stacking" - ] - }, - { - "cell_type": "markdown", - "id": "53a8e2a2-9d49-4e55-a2ee-bd6224d7fb61", - "metadata": { - "tags": [] - }, - "source": [ - "#### Does this project have enough information to be useful?" + "# lrtp_lost_df = lrtp_lost_df.drop(columns = lrtp_to_drop)" ] }, { "cell_type": "code", - "execution_count": 23, - "id": "3926aa77-991b-48be-b57d-04077a0a485b", + "execution_count": 37, + "id": "3a475324-4f2d-41a3-8b1f-ac4f0eaf0962", "metadata": {}, - "outputs": [], - "source": [ - "def categorize_info(df): \n", - " \n", - " #Get percentiles in objects for total vehicle.\n", - " p50_project_desc= df.project_description_count.quantile(0.50).astype(float)\n", - " p50_null_values = df.total_percent_null_values.quantile(0.50).astype(float)\n", - " \n", - " #Function for fleet size\n", - " def percentile_info (row):\n", - " if ((row.project_description_count >= p50_project_desc) and (row.total_percent_null_values <= p50_null_values)):\n", - " return \"Yes\"\n", - " else: \n", - " return \"No\"\n", - " df[\"enough_info\"] = df.apply(lambda x: percentile_info(x), axis=1)\n", - " \n", - " return df " - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "48854cb1-3fa8-4d4e-8e8f-7218fc8b9c7e", - "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_titlelead_agencyproject_yearproject_descriptiontotal_project_costgeometrycitycountydata_sourcenotesproject_number
416NoneStancog2022-2046Bicycle Lane (Class 2), Buffered Bicycle Lane (Class 2), Bicycle Route With Wide Shoulders (Class 3.5), Separated Bike Lane (Class 4), And Pedestrian Improvements.\\n(Non-Motorized Transportation Plan Top 25: Route 25)8027400.00NoneNoneNoneStancog LrtpJurisdiction: Stanislaus County, Location: Rhode Rd/7Th St/Nunes Rd/N. Golden State Blvd, Project Limits: Moore Rd To W. Christofferson Pkwy, Funding Source: Atp, Sb 1, Bil/Iija, Cmaq, Stbgp, System Preserv : Nan, Capacity Enhance : Nan, Safety: Nan, Oper : Nan, Complete Streets: Nan, Active\\nTransporta Tion: X, Transit: Nan, Other: Nana32f75c83b70
\n", + "
" + ], + "text/plain": [ + " project_title lead_agency project_year \\\n", + "416 None Stancog 2022-2046 \n", + "\n", + " project_description \\\n", + "416 Bicycle Lane (Class 2), Buffered Bicycle Lane (Class 2), Bicycle Route With Wide Shoulders (Class 3.5), Separated Bike Lane (Class 4), And Pedestrian Improvements.\\n(Non-Motorized Transportation Plan Top 25: Route 25) \n", + "\n", + " total_project_cost geometry city county data_source \\\n", + "416 8027400.00 None None None Stancog Lrtp \n", + "\n", + " notes \\\n", + "416 Jurisdiction: Stanislaus County, Location: Rhode Rd/7Th St/Nunes Rd/N. Golden State Blvd, Project Limits: Moore Rd To W. Christofferson Pkwy, Funding Source: Atp, Sb 1, Bil/Iija, Cmaq, Stbgp, System Preserv : Nan, Capacity Enhance : Nan, Safety: Nan, Oper : Nan, Complete Streets: Nan, Active\\nTransporta Tion: X, Transit: Nan, Other: Nan \n", + "\n", + " project_number \n", + "416 a32f75c83b70 " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "def enough_info(df):\n", - " # Select string columns\n", - " string_cols = df.select_dtypes(include=[\"object\"]).columns.to_list()\n", - " \n", - " # https://stackoverflow.com/questions/73839250/count-number-of-occurrences-of-text-over-row-python-pandas\n", - " # Count \"nones\" in string columns\n", - " df['none_counts'] = df[string_cols].astype(str).sum(axis=1).str.lower().str.count(\"none\")\n", - " \n", - " # Count zeroes\n", - " df['zero_counts'] = (df == 0).astype(int).sum(axis=1)\n", - " \n", - " # Total up all none/zeroes \n", - " df[\"total_percent_null_values\"] = df[['none_counts','zero_counts']].sum(axis=1)/len(df.columns) * 100\n", - " \n", - " # Count project descriptions\n", - " df[\"project_description_count\"] = df[\"project_description\"].str.count('\\w+')\n", - " \n", - " # Categorize whether it has enough info or not\n", - " df = categorize_info(df)\n", - " \n", - " # Compress columns to retain some info\n", - " 
df['counts'] = 'number of strings in project desc: ' + df.project_description_count.astype(str) + ' % of null values:' + df.total_percent_null_values.astype(int).astype(str)\n", - " \n", - " df = df.drop(columns = ['none_counts','zero_counts','project_description_count','total_percent_null_values'])\n", - " return df " + "lrtp_lost_df.sample()" ] }, { "cell_type": "markdown", - "id": "91c4e4b0-f28d-4956-9274-d17a3306801e", - "metadata": { - "tags": [] - }, - "source": [ - "#### Correct lead agencies again" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "id": "f0f564ce-5551-4750-94b6-bb7c5b056949", + "id": "0d3280f2-67c1-418d-8161-4a577b9d3034", "metadata": {}, - "outputs": [], - "source": [ - "def flip_county_city(df, agency_col:str):\n", - " # https://github.com/cal-itp/data-analyses/blob/main/Agreement_Overlap/add_dla.ipynb\n", - " to_correct = df[(df[agency_col].str.contains('County')) | (df[agency_col].str.contains('City'))]\n", - " to_correct = to_correct[[agency_col]].drop_duplicates().reset_index(drop = True)\n", - " to_correct['str_len'] = to_correct[agency_col].str.split().str.len()\n", - " to_correct = to_correct[to_correct.str_len <= 5 ].reset_index(drop = True)\n", - " to_correct[['name_pt1', 'name_pt2']] = to_correct[agency_col].str.split(' Of ', 1, expand=True)\n", - " to_correct['new_name'] = to_correct['name_pt2'] + ' ' + to_correct['name_pt1']\n", - " \n", - " new_names_dictionary = (dict(to_correct[[agency_col, 'new_name']].values))\n", - " df['agency_corrected'] = df[agency_col].map(new_names_dictionary)\n", - " df['agency_corrected'] = df['agency_corrected'].fillna(df[agency_col])\n", - " \n", - " df = df.drop(columns = [agency_col])\n", - " df = df.rename(columns = {\"agency_corrected\":agency_col})\n", - " \n", - " return df " - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "id": "3946f71e-f987-452b-8269-331d6cb461c7", - "metadata": { - "scrolled": true, - "tags": [] - }, - "outputs": [], "source": [ - "# 
all_projects_metric.lead_agency.value_counts()" + "### SB1" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "0bd2a79a-700b-446e-8346-5aa6fb2309f8", - "metadata": {}, - "outputs": [], - "source": [ - "def add_all_projects():\n", - "\n", - " # Load dataframes\n", - " state_rail_plan = harmonize_srp()\n", - " lost = harmonize_lost()\n", - " sb1 = harmonize_sb1()\n", - "\n", - " # Concat for df\n", - " df = pd.concat([lost, state_rail_plan, sb1])\n", - " \n", - " # Clean agency names\n", - " df = harmonization_utils.organization_cleaning(df, \"lead_agency\")\n", - " df = flip_county_city(df, 'lead_agency')\n", - " \n", - " # Determine if the project completely funded or not?\n", - " # Add up all available funds\n", - " df[\"fully_funded\"] = df.apply(harmonization_utils.funding_vs_expenses, axis=1)\n", - " \n", - " # Does this project have enough info?\n", - " df = enough_info(df)\n", - " \n", - " \n", - " return df" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "5dcafef7-30b9-4582-93c8-188ede6b8562", + "execution_count": 38, + "id": "6d530178-4f39-475b-b533-822b0b19f237", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", - "/home/jovyan/data-analyses/project_list/_sb1_utils.py:37: FutureWarning: The default value of regex will change from True to False in a future version.\n", - "/home/jovyan/data-analyses/project_list/_sb1_utils.py:30: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", - "/tmp/ipykernel_1728/2284675246.py:44: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", - "/home/jovyan/data-analyses/project_list/_harmonization_utils.py:34: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n" + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:23: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + " gdf[i]\n", + "/home/jovyan/data-analyses/project_list/_sb1_utils.py:23: FutureWarning: The default value of regex will change from True to False in a future version.\n", + " gdf[i]\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0 rows are mising geometry\n", + "7917 rows contain invalid geography\n" ] } ], "source": [ - "all_projects = add_all_projects()" + "sb1_df = sb1_utils.load_sb1()" ] }, { "cell_type": "code", - "execution_count": 27, - "id": "64c6a43d-0a8c-4f7c-a3cc-df3415163bf4", + "execution_count": 40, + "id": "7ee96e11-1d68-4991-8089-5749adacb311", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['project_title', 'lead_agency', 'project_year', 'project_category',\n", - " 'grant_program', 'phase', 'project_description',\n", - " 'total_project_cost_(millions)', 'total_available_funds_(millions)',\n", - " 'unfunded_needs_(millions)', 'city', 'county', 'location', 'geometry',\n", - " 'data_source', 'notes', 'funding_notes', 'ct_district',\n", - " 'project_description2', 'fully_funded', 'enough_info', 'counts'],\n", - " dtype='object')" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "all_projects.columns" + "sb1_df = unique_project_number(sb1_df)" ] }, { "cell_type": "code", - "execution_count": 53, - "id": "78825d55-c1b0-447b-b33e-493c7165aa25", - "metadata": { - "scrolled": true, - "tags": [] - }, + "execution_count": 76, + "id": "931ef88d-caf9-4201-bc18-d874197be065", + "metadata": {}, "outputs": [ { "data": { @@ -769,1853 +633,1019 @@ " \n", " \n", " \n", - " project_title\n", - " lead_agency\n", - " project_year\n", - " project_category\n", - " grant_program\n", - " phase\n", - " project_description\n", - " total_project_cost_(millions)\n", - " total_available_funds_(millions)\n", - " 
unfunded_needs_(millions)\n", - " city\n", - " county\n", - " location\n", - " data_source\n", - " notes\n", - " funding_notes\n", - " ct_district\n", - " project_description2\n", - " fully_funded\n", - " enough_info\n", - " counts\n", + " projectid\n", + " projname\n", + " projcatcode\n", + " projcategory\n", + " projprogcode\n", + " projprogram\n", + " multiprogfunded\n", + " projstatus\n", + " description\n", + " cost\n", + " assemblydistrict\n", + " senatedistrict\n", + " assemblycode\n", + " senatecode\n", + " countyname\n", + " cityname\n", + " countycode\n", + " citycode\n", + " appagencyname\n", + " impagencyname\n", + " geometry\n", + " totalcosts\n", + " routes\n", + " constyear\n", + " costfull\n", + " projagency\n", + " project_number\n", + " assembly_same\n", + " senate_same\n", " \n", " \n", " \n", " \n", - " 358\n", - " None\n", + " 3449\n", + " LsrFy17185261Pp030\n", + " Slurry Seal Parker Avenue\n", + " Local\n", + " Local And Regional\n", + " Lsr1718\n", + " 201718 Local Streets And Roads\n", + " NaN\n", + " PreConstruction\n", + " Slurry Seal 16324 Sf For Roadway With 69 Pci\n", + " NaN\n", + " 24\n", + " 13\n", + " 24\n", + " 13\n", + " San Mateo\n", + " Atherton Submitted By City\n", + " Sm\n", + " Atn\n", + " NaN\n", + " NaN\n", " None\n", - " 19/20, 20/21\n", - " None\n", - " Sgr\n", - " In Progress, Planned\n", - " None\n", - " 0.12\n", - " 0.12\n", - " 0.00\n", - " Corcoran\n", - " Kings\n", - " None\n", - " Sb1\n", - " Iijaprogram: , Dateupdated: 2021-09-09, Isonshs: N, Isonshscodes: N, Agencies: City Of Corcoran, Popup: None\n", - " Sb1Funds: 0.121909, Iijafunds: 0.0\n", - " None\n", - " None\n", - " Fully funded\n", - " No\n", - " number of strings in project desc: 1 % of null values:40\n", - " \n", - " \n", - " 1085\n", - " Spring Street Overlay\n", - " City Of Signal Hill\n", - " 19/20\n", - " None\n", - " Lsr\n", - " Planned\n", - " None\n", - " 3.00\n", - " 0.13\n", - " 2.87\n", - " Signal Hill\n", - " Los Angeles\n", - " None\n", - 
" Sb1\n", - " Iijaprogram: , Dateupdated: 6/30/2021, Isonshs: N, Isonshscodes: N, Agencies: City Of Signal Hill, Popup: None\n", - " Sb1Funds: 0.126705, Iijafunds: 0.0\n", - " None\n", - " None\n", - " Partially funded\n", - " No\n", - " number of strings in project desc: 1 % of null values:27\n", - " \n", - " \n", - " 2106\n", - " Major Damage Restoration\n", - " Caltrans\n", - " 20/21\n", - " None\n", - " Shopp\n", - " In Progress\n", - " A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.\n", - " 16.52\n", - " 9.08\n", - " 7.44\n", - " None\n", - " Del Norte\n", - " None\n", - " Sb1\n", - " Iijaprogram: State Hwy Operations & Protection Program Major-Federal, Dateupdated: 2022-06-28, Isonshs: None, Isonshscodes: Y, Agencies: Caltrans, Popup: Major Damage Restorationbr\n", - " Sb1Funds: 0.0, Iijafunds: 9.083566\n", - " 01\n", - " A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System.\n", - " Partially funded\n", - " Yes\n", - " number of strings in project desc: 25 % of null values:18\n", + " NaN\n", + " NaN\n", + " 2018\n", + " NaN\n", + " Atherton City\n", + " c61c443fb21d\n", + " True\n", + " True\n", " \n", " \n", "\n", "" ], "text/plain": [ - " project_title lead_agency project_year \\\n", - "358 None None 19/20, 20/21 \n", - "1085 Spring Street Overlay City Of Signal Hill 19/20 \n", - "2106 Major Damage Restoration Caltrans 20/21 \n", - "\n", - " project_category grant_program phase \\\n", - "358 None Sgr In Progress, Planned \n", - "1085 None Lsr Planned \n", - "2106 None Shopp In Progress \n", + " projectid projname projcatcode \\\n", + "3449 LsrFy17185261Pp030 Slurry Seal Parker Avenue Local \n", "\n", - " project_description \\\n", - "358 None \n", - "1085 None \n", - "2106 A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, 
Construct Retaining Walls, And Place A Video Monitoring System. \n", + " projcategory projprogcode projprogram \\\n", + "3449 Local And Regional Lsr1718 201718 Local Streets And Roads \n", "\n", - " total_project_cost_(millions) total_available_funds_(millions) \\\n", - "358 0.12 0.12 \n", - "1085 3.00 0.13 \n", - "2106 16.52 9.08 \n", + " multiprogfunded projstatus \\\n", + "3449 NaN PreConstruction \n", "\n", - " unfunded_needs_(millions) city county location \\\n", - "358 0.00 Corcoran Kings None \n", - "1085 2.87 Signal Hill Los Angeles None \n", - "2106 7.44 None Del Norte None \n", + " description cost assemblydistrict \\\n", + "3449 Slurry Seal 16324 Sf For Roadway With 69 Pci NaN 24 \n", "\n", - " data_source \\\n", - "358 Sb1 \n", - "1085 Sb1 \n", - "2106 Sb1 \n", + " senatedistrict assemblycode senatecode countyname \\\n", + "3449 13 24 13 San Mateo \n", "\n", - " notes \\\n", - "358 Iijaprogram: , Dateupdated: 2021-09-09, Isonshs: N, Isonshscodes: N, Agencies: City Of Corcoran, Popup: None \n", - "1085 Iijaprogram: , Dateupdated: 6/30/2021, Isonshs: N, Isonshscodes: N, Agencies: City Of Signal Hill, Popup: None \n", - "2106 Iijaprogram: State Hwy Operations & Protection Program Major-Federal, Dateupdated: 2022-06-28, Isonshs: None, Isonshscodes: Y, Agencies: Caltrans, Popup: Major Damage Restorationbr \n", + " cityname countycode citycode appagencyname \\\n", + "3449 Atherton Submitted By City Sm Atn NaN \n", "\n", - " funding_notes ct_district \\\n", - "358 Sb1Funds: 0.121909, Iijafunds: 0.0 None \n", - "1085 Sb1Funds: 0.126705, Iijafunds: 0.0 None \n", - "2106 Sb1Funds: 0.0, Iijafunds: 9.083566 01 \n", + " impagencyname geometry totalcosts routes constyear costfull \\\n", + "3449 NaN None NaN NaN 2018 NaN \n", "\n", - " project_description2 \\\n", - "358 None \n", - "1085 None \n", - "2106 A $16.52 Million Dollar Project In Del Norte County On Route 101 Will Realign Roadway, Construct Retaining Walls, And Place A Video Monitoring System. 
\n", - "\n", - " fully_funded enough_info \\\n", - "358 Fully funded No \n", - "1085 Partially funded No \n", - "2106 Partially funded Yes \n", - "\n", - " counts \n", - "358 number of strings in project desc: 1 % of null values:40 \n", - "1085 number of strings in project desc: 1 % of null values:27 \n", - "2106 number of strings in project desc: 25 % of null values:18 " + " projagency project_number assembly_same senate_same \n", + "3449 Atherton City c61c443fb21d True True " ] }, - "execution_count": 53, + "execution_count": 76, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects.drop(columns = ['geometry']).sample(3)" + "sb1_df.sample()" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "0983ba29-f492-4a1a-ad40-78ebd291f7d2", + "execution_count": 41, + "id": "e37cf474-f657-4dbb-8969-23e14001ac84", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Shopp 1631\n", - "Imperial D 2008 726\n", - "Hm 520\n", - "Lsr 285\n", - "State Rail Plan 276\n", - "Atp 216\n", - "Sgr 156\n", - "Stip 126\n", - "San Mateo W 2018 91\n", - "Los Angeles Angeles M 2016 89\n", - "San Benito G 2004 86\n", - "Santa Clara B 2016 85\n", - "Tircp 82\n", - "Shopa 79\n", - "San Mateo A2 2006 78\n", - "Alameda B 2000 62\n", - "San Diego A 2004 59\n", - "San Joaquin K 2003 56\n", - "Tcep 55\n", - "San Bernardino I2 2018 51\n", - "Sacramento A2 2004 51\n", - "Tulare R 2006 49\n", - "Sta 49\n", - "Sonoma M 2004 44\n", - "Alameda Bb 2014 40\n", - "Lpp-F 40\n", - "Santa Barbara A 2008 37\n", - "Los Angeles Angeles R 2008 37\n", - "Madera T 2006 36\n", - "Sccp 34\n", - "San Francisco K 2004 28\n", - "Riverside A2 2006 27\n", - "Lpp-C 21\n", - "Stanislaus L 2016 20\n", - "Contra Costa J 2004 19\n", - "Orange M2 2002 19\n", - "Santa Clara A 2000 14\n", - "Sra 11\n", - "Monterey X 2016 11\n", - "Santa Cruz D 2016 9\n", - "Marin A 2004 7\n", - "Monterey Salinas Transit Q 2016 6\n", - "Sonoma Q 2008 5\n", - "Fresno C 2006 5\n", - "Los Angelest Alameda Bb 
2014 1\n", - "Santa Clara B 2008 1\n", - "Name: grant_program, dtype: int64" + "9186" ] }, - "execution_count": 29, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects.grant_program.value_counts()" + "len(sb1_df)" ] }, { "cell_type": "code", - "execution_count": 30, - "id": "301b9bde-499e-445d-a27c-f50f522e4aa9", + "execution_count": 42, + "id": "eb4314bc-8d4f-4a1b-8425-034091a4f57f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "Sb1 3305\n", - "Local Options Sales Tax 1849\n", - "State Rail Plan 276\n", - "Name: data_source, dtype: int64" + "9186" ] }, - "execution_count": 30, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects.data_source.value_counts()" + "sb1_df.project_number.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "b09c99bf-a44e-4c8d-a9fb-fb3a86ffa1eb", + "metadata": {}, + "source": [ + "#### Check that assemblydistrict and assemblycode are the same values" ] }, { "cell_type": "code", - "execution_count": 31, - "id": "0c066920-6b09-4584-bc82-4f88b41e00d8", + "execution_count": 43, + "id": "2ec3775a-1d30-4f88-bea1-37f55a29e767", + "metadata": {}, + "outputs": [], + "source": [ + "sb1_df['assembly_same'] = sb1_df.assemblycode == sb1_df.assemblydistrict" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "41b4f512-89a6-4a08-b0bd-f1ed309d9c34", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "0.00 20.06\n", - "0.33 2.65\n", - "0.25 1.25\n", - "7.61 0.85\n", - "17.86 0.77\n", - "Name: total_project_cost_(millions), dtype: float64" + "True 7217\n", + "False 1969\n", + "Name: assembly_same, dtype: int64" ] }, - "execution_count": 31, + "execution_count": 44, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects[\"total_project_cost_(millions)\"].value_counts().head() / len(all_projects) * 100" + "sb1_df.assembly_same.value_counts()" ] }, { "cell_type": "code", - 
"execution_count": 52, - "id": "413ac763-c08b-48b0-91d5-6e53fd8f2c32", + "execution_count": 45, + "id": "76325c35-f25b-4635-8601-3f576557abee", + "metadata": {}, + "outputs": [], + "source": [ + "sb1_df['senate_same'] = sb1_df.senatedistrict == sb1_df.senatecode" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "5962a32a-c9d2-4373-8e05-329b62b42256", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "No available funding info 1963\n", - "Partially funded 1796\n", - "No project cost info 1089\n", - "Fully funded 582\n", - "Name: fully_funded, dtype: int64" + "True 6950\n", + "False 2236\n", + "Name: senate_same, dtype: int64" ] }, - "execution_count": 52, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects.fully_funded.value_counts()" + "sb1_df.senate_same.value_counts()" ] }, { - "cell_type": "markdown", - "id": "8add5491-77d7-4eaa-ad79-57072f7eddd9", + "cell_type": "code", + "execution_count": 47, + "id": "0a1360f5-eb9a-49a9-947e-249f3d8e9b99", "metadata": {}, + "outputs": [], "source": [ - "### Metrics\n", - "* Rewrite to be shorter?\n", - "* Correct spelling of descriptions?\n", - "* https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305" + "sb1_county = separate_out_df(sb1_df, ['project_number', 'countyname'])" ] }, { "cell_type": "code", - "execution_count": 65, - "id": "6d6253cd-b5f8-4431-a575-9a274e6e8bae", + "execution_count": 48, + "id": "0330602b-7b39-48c9-835f-d7cbbd4fcd3f", "metadata": {}, "outputs": [], "source": [ - "def add_categories(df):\n", - " \"\"\"\n", - " Create general categories for each projects.\n", - " https://github.com/cal-itp/data-analyses/blob/29ed3ad1d107c6be09fecbc1a5f3d8ef5f2b2da6/dla/dla_utils/clean_data.py#L305\n", - " \"\"\"\n", - " # There are many projects that are \n", - " ACTIVE_TRANSPORTATION = ['bike', 'bicycle', 'cyclist', \n", - " 'pedestrian', \n", - " ## including the 
spelling errors of `pedestrian`\n", - " 'pedestrain',\n", - " 'crosswalk', \n", - " 'bulb out', 'bulb-out', \n", - " 'active transp', 'traffic reduction', \n", - " 'speed reduction', 'ped', 'srts', \n", - " 'safe routes to school',\n", - " 'sidewalk', 'side walk', 'Cl ', 'trail',\n", - " 'atp'\n", - " ]\n", - " TRANSIT = ['bus', 'metro', 'station', #Station comes up a few times as a charging station and also as a train station\n", - " 'transit','fare', 'brt', 'yarts', 'railroad', 'highway-rail'\n", - " # , 'station' in description and 'charging station' not in description\n", - " ] \n", - " BRIDGE = [\"bridge\", 'viaduct']\n", - " STREET = ['traffic signal', 'resurface', 'resurfacing', 'slurry', 'seal' \n", - " 'sign', 'stripe', 'striping', 'median', \n", - " 'guard rail', 'guardrail', \n", - " 'road', 'street', \n", - " 'sinkhole', 'intersection', 'signal', 'curb',\n", - " 'light', 'tree', 'pavement', 'roundabout'\n", - " ]\n", - "\n", - " FREEWAY = ['hov ', 'hot ', 'freeway', 'highway', 'express lanes', 'hwy']\n", - "\n", - " INFRA_RESILIENCY_ER = ['repair', 'emergency', 'replace','retrofit', 'er',\n", - " 'rehab', 'improvements', 'seismic', 'reconstruct', 'restoration']\n", - "\n", - " CONGESTION_RELIEF = ['congestion', 'rideshare','ridesharing', 'vanpool', 'car share']\n", - "\n", - " NOT_INC = ['charging', 'fueling', 'cng', 'bridge', 'trail',\n", - " 'k-rail', 'guardrails', 'bridge rail', 'guard', 'guarrail']\n", - " \n", - " PASSENGER_MODE = ['non sov', 'high quality transit areas', \n", - " 'hqta', 'hov']\n", - " \n", - " \n", - " SAFETY = ['fatalities','safe', 'speed management','signal coordination',\n", - " 'slow speeds', 'roundabouts', 'victims','collisoins','protect',\n", - " 'crash', 'modification factors', 'safety system'] \n", - " \n", - " def categorize_project_descriptions(row):\n", - " \"\"\"\n", - " This function takes a individual type of work description (row of a dataframe)\n", - " and returns a dummy flag of 1 if it finds keyword present 
in\n", - " project categories (active transportation, transit, bridge, etc).\n", - " A description can contain multiple keywords across categories.\n", - " \"\"\"\n", - " # Clean up project description 2\n", - " project_description = (row.project_description2.lower()\n", - " .replace(\"-\",\"\")\n", - " .replace(\".\",\"\")\n", - " .replace(\":\",\"\")\n", - " )\n", - " \n", - " # Store a bunch of columns that will be flagged\n", - " # A project can involve multiple things...also, not sure what's in the descriptions\n", - " active_transp = \"\"\n", - " transit = \"\"\n", - " bridge =\"\"\n", - " street = \"\"\n", - " freeway = \"\"\n", - " infra_resiliency_er = \"\"\n", - " congestion_relief = \"\"\n", - " passenger_mode_shift = \"\"\n", - " safety = \"\"\n", - " \n", - " if any(word in project_description for word in ACTIVE_TRANSPORTATION):\n", - " active_transp = \"active transportation\"\n", - " \n", - " #if any(word in description if instanceof(word, str) else word(description) for word in TRANSIT)\n", - "\n", - " if (any(word in project_description for word in TRANSIT) and \n", - " not any(exclude_word in project_description for exclude_word in NOT_INC)\n", - " ):\n", - " transit = \"transit\"\n", - " if any(word in project_description for word in BRIDGE):\n", - " bridge = \"bridge\"\n", - " if any(word in project_description for word in STREET):\n", - " street = \"street\"\n", - " if any(word in project_description for word in FREEWAY):\n", - " freeway = \"freeway\" \n", - " if any(word in project_description for word in INFRA_RESILIENCY_ER):\n", - " infra_resiliency_er = \"infrastructure\"\n", - " if any(word in project_description for word in CONGESTION_RELIEF):\n", - " congestion_relief = \"congestion relief\" \n", - " if any(word in project_description for word in PASSENGER_MODE):\n", - " passenger_mode_shift = \"passenger mode shift\" \n", - " if any(word in project_description for word in SAFETY):\n", - " safety = \"safety\" \n", - " return 
pd.Series(\n", - " [active_transp, transit, bridge, street, freeway, infra_resiliency_er, congestion_relief,\n", - " passenger_mode_shift, safety], \n", - " index=['active_transp', 'transit', 'bridge', 'street', \n", - " 'freeway', 'infra_resiliency_er', 'congestion_relief',\n", - " 'passenger_mode_shift', 'safety']\n", - " )\n", - " \n", - " \n", - " work_categories = df.apply(categorize_project_descriptions, axis=1)\n", - " work_cols = list(work_categories.columns)\n", - " df2 = pd.concat([df, work_categories], axis=1)\n", - " \n", - " df2['categories'] = df2[work_cols].agg(' '.join, axis=1)\n", - " df2['categories'] = df2['categories'].str.strip()\n", - " df2 = df2.drop(columns = work_cols)\n", - " \n", + "def explode_dataframe(df:pd.DataFrame, column_to_explode:str)-> pd.DataFrame:\n", + " df['Column2'] = df[column_to_explode].apply(lambda x: [int(i) if i.isdigit() else i for i in x.replace(',', '').split()])\n", + " df = df.drop(columns = [column_to_explode])\n", + " df2 = df.explode('Column2')\n", + " df2 = df2.rename(columns = {'Column2': column_to_explode})\n", " return df2" ] }, { "cell_type": "code", - "execution_count": 66, - "id": "6ea11daa-3a18-4d8a-9004-b2fc5e6d4343", + "execution_count": 49, + "id": "ea4fd105-9188-426f-a724-5de40ae5af4b", "metadata": {}, "outputs": [], "source": [ - "all_projects_metric = add_categories(all_projects)" + "sb1_assembly = separate_out_df(sb1_df, ['project_number', 'assemblydistrict'])" ] }, { "cell_type": "code", - "execution_count": 67, - "id": "f3856c74-228d-4cf8-929a-cac486024586", + "execution_count": 50, + "id": "6b621482-fb52-441f-9fde-bb021dda35c5", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
project_titlelead_agencyproject_yearproject_categorygrant_programphaseproject_descriptiontotal_project_cost_(millions)total_available_funds_(millions)unfunded_needs_(millions)citycountylocationdata_sourcenotesfunding_notesct_districtproject_description2fully_fundedenough_infocountscategories
1886Safety - Hm4Caltrans21/22NoneHmIn ProgressMaintain/Repair Transportaiton Management Systems0.200.000.20VisaliaTulareNoneSb1Iijaprogram: None, Dateupdated: 2022-09-19, Isonshs: None, Isonshscodes: N, Agencies: Caltrans, Popup:Sb1Funds: 0.0, Iijafunds: 0.006Maintain/Repair Transportaiton Management SystemsNo available funding infoYesnumber of strings in project desc: 5 % of null values:22infrastructure
1119Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 SNoneNoneFreeway Safety And Congestion Relief ProgramSacramento A2 2004NoneNone47.000.0047.00NoneSacramentoNoneLocal Options Sales TaxNotes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category.Estimated Lost Funds: 0.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 0.0NoneBus/Carpool Ramp Connection From Sr 50 E To Sr 99 SNo available funding infoNonumber of strings in project desc: 1 % of null values:40transit
1589Highway 101: Betteravia Road InterchangeNoneNoneNoneSanta Barbara A 2008NoneImprove The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.2.005.00-3.00NoneSanta BarbaraNoneLocal Options Sales TaxNotes: NanEstimated Lost Funds: 2.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 3.0NoneImprove The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant.Fully fundedNonumber of strings in project desc: 24 % of null values:36street freeway infrastructure
\n", - "
" - ], - "text/plain": [ - " project_title lead_agency \\\n", - "1886 Safety - Hm4 Caltrans \n", - "1119 Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S None \n", - "1589 Highway 101: Betteravia Road Interchange None \n", - "\n", - " project_year project_category \\\n", - "1886 21/22 None \n", - "1119 None Freeway Safety And Congestion Relief Program \n", - "1589 None None \n", - "\n", - " grant_program phase \\\n", - "1886 Hm In Progress \n", - "1119 Sacramento A2 2004 None \n", - "1589 Santa Barbara A 2008 None \n", - "\n", - " project_description \\\n", - "1886 Maintain/Repair Transportaiton Management Systems \n", - "1119 None \n", - "1589 Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant. \n", - "\n", - " total_project_cost_(millions) total_available_funds_(millions) \\\n", - "1886 0.20 0.00 \n", - "1119 47.00 0.00 \n", - "1589 2.00 5.00 \n", - "\n", - " unfunded_needs_(millions) city county location \\\n", - "1886 0.20 Visalia Tulare None \n", - "1119 47.00 None Sacramento None \n", - "1589 -3.00 None Santa Barbara None \n", - "\n", - " data_source \\\n", - "1886 Sb1 \n", - "1119 Local Options Sales Tax \n", - "1589 Local Options Sales Tax \n", - "\n", - " notes \\\n", - "1886 Iijaprogram: None, Dateupdated: 2022-09-19, Isonshs: None, Isonshscodes: N, Agencies: Caltrans, Popup: \n", - "1119 Notes: No Specific Amounts For Each Project. Divided Total Fund Slated For A Project Category By Number Of Projects In That Category. 
\n", - "1589 Notes: Nan \n", - "\n", - " funding_notes \\\n", - "1886 Sb1Funds: 0.0, Iijafunds: 0.0 \n", - "1119 Estimated Lost Funds: 0.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 0.0 \n", - "1589 Estimated Lost Funds: 2.0, Estimated Federal Funds: 0.0, Estimated State Funds: 0.0, Estimated Local Funds: 0, Estimated Other Funds: 3.0 \n", - "\n", - " ct_district \\\n", - "1886 06 \n", - "1119 None \n", - "1589 None \n", - "\n", - " project_description2 \\\n", - "1886 Maintain/Repair Transportaiton Management Systems \n", - "1119 Bus/Carpool Ramp Connection From Sr 50 E To Sr 99 S \n", - "1589 Improve The Operations Of Intersections At Betteravia Road And Highway 101 By Constructionructioning A\\nNorthbound Loop On Ramp In The South East Interchange Quadrant. \n", - "\n", - " fully_funded enough_info \\\n", - "1886 No available funding info Yes \n", - "1119 No available funding info No \n", - "1589 Fully funded No \n", - "\n", - " counts \\\n", - "1886 number of strings in project desc: 5 % of null values:22 \n", - "1119 number of strings in project desc: 1 % of null values:40 \n", - "1589 number of strings in project desc: 24 % of null values:36 \n", - "\n", - " categories \n", - "1886 infrastructure \n", - "1119 transit \n", - "1589 street freeway infrastructure " - ] - }, - "execution_count": 67, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "all_projects_metric.drop(columns = ['geometry']).sample(3)" + "sb1_assembly = explode_dataframe(sb1_assembly, 'assemblydistrict')" ] }, { "cell_type": "code", - "execution_count": 69, - "id": "ad99b589-1d78-4052-96ac-4617f0494544", + "execution_count": 51, + "id": "a0e95618-ea4f-4b7d-b1fc-954a780dd86f", "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "infrastructure 1436\n", - " 1381\n", - "street infrastructure 739\n", - "street 372\n", - "bridge infrastructure 226\n", - "transit infrastructure 
201\n", - "active transportation street infrastructure 106\n", - "transit 75\n", - "street infrastructure safety 58\n", - "transit street infrastructure 52\n", - "freeway infrastructure 52\n", - "bridge street infrastructure 45\n", - "bridge 44\n", - "active transportation infrastructure 44\n", - "active transportation 42\n", - "street freeway infrastructure safety 41\n", - "street freeway infrastructure 37\n", - "infrastructure safety 36\n", - "active transportation street infrastructure safety 29\n", - "freeway infrastructure passenger mode shift 22\n", - "active transportation transit infrastructure 21\n", - "freeway 20\n", - "freeway infrastructure safety 18\n", - "active transportation transit street infrastructure 17\n", - "bridge street infrastructure safety 14\n", - "bridge infrastructure safety 12\n", - "street infrastructure congestion relief 11\n", - "active transportation transit street infrastructure safety 11\n", - "passenger mode shift 11\n", - "street safety 10\n", - "Name: categories, dtype: int64" - ] - }, - "execution_count": 69, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ - "all_projects_metric.categories.value_counts().head(30)" + "sb1_senate = separate_out_df(sb1_df, ['project_number', 'senatedistrict'])" ] }, { "cell_type": "code", - "execution_count": 88, - "id": "270e8b35-cc6b-4461-835c-40c4b850916d", + "execution_count": 52, + "id": "1d67d259-8822-4512-877b-bc1b66de7d96", "metadata": {}, "outputs": [], "source": [ - "def apply_metrics(df):\n", - " def categorize_metrics(row):\n", - " categories = row.categories.lower()\n", - " safety = \"\"\n", - " passenger_mode_shift = \"\"\n", - " infill_development = \"\"\n", - " \n", - " if any(word in categories for word in ['infrastructure','safety',]):\n", - " safety = \"safety\"\n", - " if any(word in categories for word in ['active transportation', 'passenger_mode_shift', \"congestion relief\"]):\n", - " passenger_mode_shift = 
\"passenger_mode_shift\"\n", - " if any(word in categories for word in ['transit', 'active transportation',]):\n", - " infill_development = \"infill_development\" \n", - " \n", - " return pd.Series(\n", - " [safety,passenger_mode_shift,infill_development], \n", - " index=['safety', 'passenger_mode_shift', 'infill_development']\n", - " )\n", - " \n", - " work_categories = df.apply(categorize_metrics, axis=1)\n", - " work_cols = list(work_categories.columns)\n", - " df2 = pd.concat([df, work_categories], axis=1)\n", - " \n", - " df2['applicable_metrics'] = df2[work_cols].agg(' '.join, axis=1)\n", - " df2['applicable_metrics'] = df2['applicable_metrics'].str.strip()\n", - " df2 = df2.drop(columns = work_cols)\n", - " \n", - " return df2" + "sb1_senate = explode_dataframe(sb1_senate, 'senatedistrict')" ] }, { "cell_type": "code", - "execution_count": 89, - "id": "9a643de4-b6b3-4751-9a9f-b68abe4d7a22", + "execution_count": 53, + "id": "137fda15-a412-49fe-be74-48d58f750bb3", "metadata": {}, "outputs": [], "source": [ - "all_projects_metric = apply_metrics(all_projects_metric)" + "sb1_city = separate_out_df(sb1_df, ['project_number', 'cityname'])" ] }, { "cell_type": "code", - "execution_count": 90, - "id": "a6da3b49-dd3f-4b01-b394-23f44bf8e3a6", - "metadata": { - "scrolled": true, - "tags": [] - }, + "execution_count": 56, + "id": "d98c2dc2-3232-4fe0-ba3f-6f2d148fb755", + "metadata": {}, + "outputs": [], + "source": [ + "sb1_geo = separate_out_df(sb1_df, ['project_number', 'geometry'])" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "67e7d80d-ee5c-4c1b-8337-bb30be56f585", + "metadata": {}, + "outputs": [], + "source": [ + "sb1_awards = separate_out_df(sb1_df, ['project_number', 'projprogram'])" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "ab21d511-2062-4c89-ad45-dfdc95721cc1", + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
grant_programproject_description2categoriesapplicable_metrics
2587ShoppA $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip.infrastructuresafety
1058LsrNone
17State Rail PlanExpansion Of The Smart Fleet To Accommodate Service Expansion.infrastructuresafety
845Imperial D 2008Overlayinfrastructuresafety
1933HmMaintain/Repair Pavement - Seal Coatstreet infrastructuresafety
2032ShoppA $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders.infrastructuresafety
331SgrNone
3222ShoppA $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation)bridge infrastructuresafety
106State Rail PlanDouble Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks.transitinfill_development
1092LsrNone
1703StipNear The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198. Widen To 4 Divided Lanes And Realign Highway.street freeway infrastructuresafety
753Imperial D 2008Overlayinfrastructuresafety
43State Rail PlanCaltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph.transit infrastructuresafety infill_development
627HmMaintain/Repair Transportaiton Management Systemsinfrastructuresafety
900Los Angeles Angeles M 2016Transportation System And Mobility Improve Program
2016ShoppA $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail.street infrastructuresafety
1759HmRepair/Replace Culvertsinfrastructuresafety
2082ShoppA $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements.street infrastructuresafety
1488ShoppA $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations.transit infrastructuresafety infill_development
1886HmMaintain/Repair Transportaiton Management Systemsinfrastructuresafety
311Imperial D 2008Reconstructioninfrastructuresafety
1335ShoppA $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards.bridge street infrastructuresafety
487HmMaintain/Repair Maintenance Stationtransit infrastructuresafety infill_development
471SgrNone
901Los Angeles Angeles M 2016Active Transportation 1St/Last Mile Connections Progactive transportationpassenger_mode_shift infill_development
456Imperial D 2008Lincoln Ave From Rose Ave To Weakley St S
1140LsrNone
3275ShoppA $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs.street infrastructure safetysafety
3176ShoppA $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards.active transportation streetpassenger_mode_shift infill_development
1540San Mateo W 2018Pedestrian Accessibility Improvements Citywideactive transportation infrastructuresafety passenger_mode_shift infill_development
1318San Diego A 20048F+2Hovpassenger mode shift
3135ShoppA $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst).
320Imperial D 2008Overlayinfrastructuresafety
1633TcepIn San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System Needed To Operate The I-15 Express Lanes Project (08-0167M).street freeway infrastructuresafety
1376San Joaquin K 2003Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits
3036ShoppA $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps.street infrastructuresafety
826Imperial D 2008Reconstructinfrastructuresafety
340Imperial D 2008Crack Seal/Slurry Coatstreet
2405ShoppA $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety.street freeway infrastructure safetysafety
814Imperial D 2008Overlayinfrastructuresafety
44TircpNone
1041LsrNone
134State Rail PlanDesign And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton. This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood.transit infrastructuresafety infill_development
2009ShoppA $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge.bridge infrastructuresafety
12Alameda B 2000I-580 Interchange Improvements In Castro Valleyinfrastructuresafety
2107ShoppA $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements.infrastructure safetysafety
1371ShoppA $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge.bridge infrastructuresafety
973Los Angeles Angeles R 2008Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.)transit infrastructuresafety infill_development
1823Tulare R 2006Over Crossinginfrastructuresafety
97State Rail PlanDouble Track Between Cp Canyon (Newhall Siding) And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita.infrastructuresafety
\n", - "
" - ], "text/plain": [ - " grant_program \\\n", - "2587 Shopp \n", - "1058 Lsr \n", - "17 State Rail Plan \n", - "845 Imperial D 2008 \n", - "1933 Hm \n", - "2032 Shopp \n", - "331 Sgr \n", - "3222 Shopp \n", - "106 State Rail Plan \n", - "1092 Lsr \n", - "1703 Stip \n", - "753 Imperial D 2008 \n", - "43 State Rail Plan \n", - "627 Hm \n", - "900 Los Angeles Angeles M 2016 \n", - "2016 Shopp \n", - "1759 Hm \n", - "2082 Shopp \n", - "1488 Shopp \n", - "1886 Hm \n", - "311 Imperial D 2008 \n", - "1335 Shopp \n", - "487 Hm \n", - "471 Sgr \n", - "901 Los Angeles Angeles M 2016 \n", - "456 Imperial D 2008 \n", - "1140 Lsr \n", - "3275 Shopp \n", - "3176 Shopp \n", - "1540 San Mateo W 2018 \n", - "1318 San Diego A 2004 \n", - "3135 Shopp \n", - "320 Imperial D 2008 \n", - "1633 Tcep \n", - "1376 San Joaquin K 2003 \n", - "3036 Shopp \n", - "826 Imperial D 2008 \n", - "340 Imperial D 2008 \n", - "2405 Shopp \n", - "814 Imperial D 2008 \n", - "44 Tircp \n", - "1041 Lsr \n", - "134 State Rail Plan \n", - "2009 Shopp \n", - "12 Alameda B 2000 \n", - "2107 Shopp \n", - "1371 Shopp \n", - "973 Los Angeles Angeles R 2008 \n", - "1823 Tulare R 2006 \n", - "97 State Rail Plan \n", - "\n", - " project_description2 \\\n", - "2587 A $4.91 Million Dollar Project In Santa Barbara County On Route 154 Will Place High Friction Surface Treatment (Hfst) And Construct Centerline Rumble Strip. \n", - "1058 None \n", - "17 Expansion Of The Smart Fleet To Accommodate Service Expansion. \n", - "845 Overlay \n", - "1933 Maintain/Repair Pavement - Seal Coat \n", - "2032 A $11.57 Million Dollar Project In Humboldt County On Route 299 Will Widen Shoulders. \n", - "331 None \n", - "3222 A $5.8 Million Dollar Project In San Diego County On Route 5 Will Apply Polyester Concrete Overlay To Bridge Decks, Apply Methacrylate To Approach Slabs, And Repair Spalls. (Bridge Deck Preservation) \n", - "106 Double Track From Mp 436.65 To Cp Santa Susana To Allow At-Speed Meets At 437.4. 
Add 2Nd Platform At Simi Valley Station To Allow Boarding From Both Tracks. \n", - "1092 None \n", - "1703 Near The City Of Tulare, On Route 65 From Lindsay To Exeter, And On Road 204 From Route 137 To Route 198. Widen To 4 Divided Lanes And Realign Highway. \n", - "753 Overlay \n", - "43 Caltrain Electrification Will Electrify The Corridor From San Francisco Caltrain Station To The Tamien Caltrain Station. Electrification Improvements Include Converting Diesel-Hauled Trains To Electric Trains, Increasing Service To Six Trains Per Peak Hour Per Direction, And Maintaining Operating Speed Up To 79 Mph. \n", - "627 Maintain/Repair Transportaiton Management Systems \n", - "900 Transportation System And Mobility Improve Program \n", - "2016 A $7.4 Million Dollar Project In Mendocino County On Route 1 Will Widen For Standard Shoulders, Improve Roadway Cross Slope, And Install Rumble Strips And Guardrail. \n", - "1759 Repair/Replace Culverts \n", - "2082 A $24.63 Million Dollar Project In Mendocino County On Route 128 Will Rehabilitate Pavement, Upgrade Transportation Management System (Tms) Elements, Guardrails, And Sign Panels, Upgrade Facilities To Americans With Disabilities Act (Ada) Standards, And Construct Complete Streets Elements. \n", - "1488 A $6.28 Million Dollar Project In San Bernardino County On Route Var Will Upgrade Traffic Census Stations. \n", - "1886 Maintain/Repair Transportaiton Management Systems \n", - "311 Reconstruction \n", - "1335 A $4.92 Million Dollar Project In San Mateo County On Route 82 Will Upgrade Bridge Rails And Signals And Upgrade Facilities To Americans With Disabilities Act (Ada) Standards. 
\n", - "487 Maintain/Repair Maintenance Station \n", - "471 None \n", - "901 Active Transportation 1St/Last Mile Connections Prog \n", - "456 Lincoln Ave From Rose Ave To Weakley St S \n", - "1140 None \n", - "3275 A $36.09 Million Dollar Project In Orange County On Route 22 Will Install Safety Lighting And Upgrade Median Barrier, Drainage Systems, And Signs. \n", - "3176 A $4.53 Million Dollar Project In San Diego County On Route 94 Will Construct And Upgrade Pedestrian Curb Ramps And Sidewalks To Meet Current Standards. \n", - "1540 Pedestrian Accessibility Improvements Citywide \n", - "1318 8F+2Hov \n", - "3135 A $2.87 Million Dollar Project In San Joaquin County On Route 99 Will Apply High Friction Surface Treatment (Hfst). \n", - "320 Overlay \n", - "1633 In San Bernardino And Riverside Counties Through The Cities Of Eastvale, Jurupa Valley, Ontario, And Rancho Cucamonga, On I-15 From Cantu Galleano Road To Foothill Boulevard. This Project Will Construct The Toll System Needed To Operate The I-15 Express Lanes Project (08-0167M). \n", - "1376 Widen From 2 To 4 Lanes Between I-5 And The Lodi City Limits \n", - "3036 A $4.57 Million Dollar Project In Riverside County On Route 60 Will Reduce Wrong-Way Collisions By Installing Wrong-Way Pavement Markers And Sign Panels, And Upgrading Pavement Markings At Onramps And Offramps. \n", - "826 Reconstruct \n", - "340 Crack Seal/Slurry Coat \n", - "2405 A $7.14 Million Dollar Project In Santa Clara County On Route 152 Will Rehabilitate Drainage Systems, Upgrade Guardrail, And Pave Roadside Areas To Prevent Vegetation Growth And Enhance Highway Worker Safety. \n", - "814 Overlay \n", - "44 None \n", - "1041 None \n", - "134 Design And Construct A New Station And Platform In The Oakley Civic Center On The San Joaquins Route Between Oakland And Stockton. This Station Is Five Miles From The Existing Antioch/Pitsburg Station And Will Serve The Communities Of Oakley And Brentwood. 
\n", - "2009 A $104.39 Million Dollar Project In Del Norte County On Route 101 Will Replace Bridge. \n", - "12 I-580 Interchange Improvements In Castro Valley \n", - "2107 A $9.16 Million Dollar Project In Mendocino County On Route 020 Will Safety Improvements. \n", - "1371 A $9.7 Million Dollar Project In Santa Barbara County On Route 154 Will Replace Bridge. \n", - "973 Bus Operations (Countywide Bus Service Operations,\\nMaintenance, And Expansion. Suspend A Scheduled\\nJuly 1, 2009 Metro Fare Increase For One Year And\\nFreeze All Metro Student, Senior, Disabled, And\\nMedicare Fares Through June 30, 2013 By Instead\\nUsing Metro'S Formula Allocation Procedure Share Of\\nThis Subfund.) \n", - "1823 Over Crossing \n", - "97 Double Track Between Cp Canyon (Newhall Siding) And Cp Hood (Canyon) To Allow 15- Minute Service Between La And Santa Clarita. \n", - "\n", - " categories \\\n", - "2587 infrastructure \n", - "1058 \n", - "17 infrastructure \n", - "845 infrastructure \n", - "1933 street infrastructure \n", - "2032 infrastructure \n", - "331 \n", - "3222 bridge infrastructure \n", - "106 transit \n", - "1092 \n", - "1703 street freeway infrastructure \n", - "753 infrastructure \n", - "43 transit infrastructure \n", - "627 infrastructure \n", - "900 \n", - "2016 street infrastructure \n", - "1759 infrastructure \n", - "2082 street infrastructure \n", - "1488 transit infrastructure \n", - "1886 infrastructure \n", - "311 infrastructure \n", - "1335 bridge street infrastructure \n", - "487 transit infrastructure \n", - "471 \n", - "901 active transportation \n", - "456 \n", - "1140 \n", - "3275 street infrastructure safety \n", - "3176 active transportation street \n", - "1540 active transportation infrastructure \n", - "1318 passenger mode shift \n", - "3135 \n", - "320 infrastructure \n", - "1633 street freeway infrastructure \n", - "1376 \n", - "3036 street infrastructure \n", - "826 infrastructure \n", - "340 street \n", - "2405 street freeway infrastructure 
safety \n", - "814 infrastructure \n", - "44 \n", - "1041 \n", - "134 transit infrastructure \n", - "2009 bridge infrastructure \n", - "12 infrastructure \n", - "2107 infrastructure safety \n", - "1371 bridge infrastructure \n", - "973 transit infrastructure \n", - "1823 infrastructure \n", - "97 infrastructure \n", - "\n", - " applicable_metrics \n", - "2587 safety \n", - "1058 \n", - "17 safety \n", - "845 safety \n", - "1933 safety \n", - "2032 safety \n", - "331 \n", - "3222 safety \n", - "106 infill_development \n", - "1092 \n", - "1703 safety \n", - "753 safety \n", - "43 safety infill_development \n", - "627 safety \n", - "900 \n", - "2016 safety \n", - "1759 safety \n", - "2082 safety \n", - "1488 safety infill_development \n", - "1886 safety \n", - "311 safety \n", - "1335 safety \n", - "487 safety infill_development \n", - "471 \n", - "901 passenger_mode_shift infill_development \n", - "456 \n", - "1140 \n", - "3275 safety \n", - "3176 passenger_mode_shift infill_development \n", - "1540 safety passenger_mode_shift infill_development \n", - "1318 \n", - "3135 \n", - "320 safety \n", - "1633 safety \n", - "1376 \n", - "3036 safety \n", - "826 safety \n", - "340 \n", - "2405 safety \n", - "814 safety \n", - "44 \n", - "1041 \n", - "134 safety infill_development \n", - "2009 safety \n", - "12 safety \n", - "2107 safety \n", - "1371 safety \n", - "973 safety infill_development \n", - "1823 safety \n", - "97 safety " + "((9186, 2), (1585, 2), (6696, 2))" ] }, - "execution_count": 90, + "execution_count": 58, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects_metric[['grant_program','project_description2','categories','applicable_metrics']].sample(50)" + "sb1_awards.shape, sb1_geo.shape, sb1_city.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "d09e706e-cf16-4434-8b4b-b2661f0ba742", + "metadata": {}, + "outputs": [], + "source": [ + "sb1_agencies = separate_out_df(sb1_df, ['project_number', 'projagency', 
'appagencyname', 'impagencyname'])" ] }, { "cell_type": "code", - "execution_count": 86, - "id": "9ea21916-dd50-4396-850b-87ea2535c9f4", + "execution_count": 60, + "id": "9a07f377-dc3d-402c-8c26-2f7def5abb41", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "7" + "(9186, 4)" ] }, - "execution_count": 86, + "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "all_projects_metric.applicable_metrics.nunique()" + "sb1_agencies.shape" ] }, { "cell_type": "markdown", - "id": "1ad5b6ae-9407-46ae-b2ff-c9ad6cbea83c", + "id": "1150a152-1432-40b9-a946-6a79964bf720", "metadata": {}, "source": [ - "### Categorization" + "### LP2000" ] }, { "cell_type": "code", - "execution_count": 43, - "id": "f9a86d28-9b77-48c3-ba6d-27dc360f2fd0", + "execution_count": 79, + "id": "b1c94570-cb6c-4e47-91c9-59caf8512dc7", "metadata": {}, "outputs": [], "source": [ - "def get_list_of_words(df, col: str) -> list:\n", - " \"\"\"\n", - " Natalie's function to clean and place words in a project description column\n", - " into a list\n", - " \"\"\"\n", - " # get just the one col\n", - " column = df[[col]]\n", - "\n", - " # remove single-dimensional entries from the shape of an array\n", - " col_text = column.squeeze()\n", - " # get list of words\n", - " text_list = col_text.tolist()\n", - "\n", - " # Join all the column into one large text blob, lower text\n", - " text_list = \" \".join(text_list).lower()\n", - "\n", - " # remove punctuation\n", - " text_list = re.sub(r\"[^\\w\\s]\", \"\", text_list)\n", + "def load_lp2000(file: str):\n", + " \n", + " df_project = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"project\")\n", + " )\n", "\n", - " # List of stopwords\n", - " swords = [re.sub(r\"[^A-z\\s]\", \"\", sword) for sword in stopwords.words(\"english\")]\n", + " df_county = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"county\")\n", + " 
).drop(columns=[\"project_label_name\"])\n", "\n", - " # Remove stopwords\n", - " clean_text_list = [\n", - " word for word in word_tokenize(text_list.lower()) if word not in swords\n", - " ]\n", + " df_district = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"district\")\n", + " ).drop(columns=[\"project_label_name\"])\n", "\n", - " return clean_text_list" + " df_award = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"awards\")\n", + " )\n", + " \n", + " df_phase = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"phase_funding\")\n", + " )\n", + " \n", + " return df_project, df_county, df_district, df_award, df_phase" ] }, { "cell_type": "code", - "execution_count": 44, - "id": "cd602787-2444-49c5-8bb8-c59a63975de5", + "execution_count": 80, + "id": "91c989ec-208b-4053-8ae8-3e3a0e44c5c8", "metadata": {}, "outputs": [], "source": [ - "def find_common_phrases(df, description_column: str, values_to_add: list):\n", - "\n", - " # Break apart every word in the description column into a list\n", - " descriptions_list = get_list_of_words(df, description_column)\n", - "\n", - " # Get phrases of whatever length you want (2,3,4,etc)\n", - " c = Counter([\" \".join(y) for x in [2] for y in ngrams(descriptions_list, x)])\n", - "\n", - " # Make a dataframe out of the counter values\n", - " df_phrases = pd.DataFrame({\"phrases\": list(c.keys()), \"total\": list(c.values())})\n", - "\n", - " # Take phrases that are repeated more than 40 times and turn it into a list\n", - " df_phrases = ((df_phrases.loc[df_phrases[\"total\"] > 40])).reset_index(drop=True)\n", - " common_phrases_list = df_phrases.phrases.tolist()\n", - "\n", - " phrases_to_del = [\n", - " \"san bernardino\",\n", - " \"los angeles\",\n", - " \"contra costa\",\n", - " \"el dorado\",\n", - " \"san luis obispo\",\n", - " \"luis obispo\",\n", - " \"del norte\",\n", - " \"san francisco\",\n", - " 
\"improve approximately\",\n", - " ]\n", - "\n", - " common_phrases_list = list(set(common_phrases_list) - set(phrases_to_del))\n", - "\n", - " # CLean up the list to delete county information/etc\n", - " words_to_delete = [\n", - " \"county\",\n", - " \"route\",\n", - " \"dollar\",\n", - " \"mile\",\n", - " \"santa\",\n", - " \"project\",\n", - " \"san\",\n", - " \"lanes\",\n", - " \"lane\",\n", - " \"2\",\n", - " \"4\",\n", - " \"financial\",\n", - " \"prop\",\n", - " \"best\",\n", - " \"approximately\",\n", - " ]\n", - "\n", - " for word in words_to_delete:\n", - " common_phrases_list = [x for x in common_phrases_list if word not in x]\n", - "\n", - " # ADD certain keywords here\n", - " # Operating Additional Service\n", - " common_phrases_list.extend(values_to_add)\n", - "\n", - " return common_phrases_list" + "lp2000_project, lp2000_county, lp2000_district, lp2000_award, lp2000_phase = load_lp2000(\"LP2000_projects.xlsx\")" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "ec139873-4bb7-4428-9fd7-ceb9e247d4a3", + "execution_count": 63, + "id": "f7947161-a519-4342-8672-edbdba742984", "metadata": {}, "outputs": [], "source": [ - "def categorize_projects(\n", - " df,\n", - " description_column: str,\n", - " project_id_column: str,\n", - " title_column: str,\n", - " values_to_add: list,\n", - "):\n", - "\n", - " # Find most common 2 word phrases for some automatic project categories\n", - " common_phrases_list = find_common_phrases(df, description_column, values_to_add)\n", - "\n", - " # Place all the words in common_phrases_list into a blob named query\n", - " # https://stackoverflow.com/questions/64727090/extract-all-matching-keywords-from-a-list-of-words-and-create-a-new-dataframe-pa\n", - " query = \"|\".join(common_phrases_list)\n", - "\n", - " # Remove punctation and lower strings in original description column befores searching\n", - " df[\"clean_description\"] = (\n", - " df[description_column]\n", - " .str.lower()\n", - " .str.replace(\"-\", 
\" \", regex=True)\n", - " .str.replace(\"(\", \" \", regex=True)\n", - " .str.replace(\")\", \" \", regex=True)\n", - " .str.replace(\".\", \" \", regex=True)\n", - " .str.strip()\n", - " )\n", - "\n", - " # Search through description column for the most common phrases\n", - " # Input the results in the new column\n", - " df[\"auto_project_category\"] = df[\"clean_description\"].str.findall(\n", - " r\"\\b({})\\b\".format(query)\n", - " )\n", - "\n", - " # Explode to take categories out of a list\n", - " # Drop duplicate project keywords by title\n", - " df = (\n", - " df.explode(\"auto_project_category\")\n", - " .sort_values([project_id_column, title_column])\n", - " .drop_duplicates(\n", - " subset=[\n", - " description_column,\n", - " project_id_column,\n", - " title_column,\n", - " \"auto_project_category\",\n", - " ]\n", - " )\n", - " )\n", - "\n", - " # Fill any uncategorized projects as \"Other\"\n", - " df[\"auto_project_category\"] = (\n", - " df[\"auto_project_category\"].fillna(\"Other\").str.title()\n", - " )\n", - "\n", - " # Correct spelling\n", - " spell = Speller(lang=\"en\")\n", - " df[\"auto_project_category\"] = df[\"auto_project_category\"].apply(\n", - " lambda x: \" \".join([spell(i) for i in x.split()])\n", - " )\n", - "\n", - " # Summarize - put all the categories onto one line\n", - " df = (\n", - " df.groupby(\n", - " [\n", - " description_column,\n", - " project_id_column,\n", - " title_column,\n", - " ]\n", - " )[\"auto_project_category\"]\n", - " .apply(\",\".join)\n", - " .reset_index()\n", - " )\n", - "\n", - " return df" + "# lp2000_project = unique_project_number(lp2000_project)" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "d123f3b9-da23-4d4d-a2e2-dc3769100171", + "execution_count": 64, + "id": "4e674bed-ebd1-43eb-a944-721e30f22cfb", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(11272, 11272)" + ] + }, + "execution_count": 64, + "metadata": {}, + "output_type": 
"execute_result" + } + ], "source": [ - "def add_all_projects2():\n", - "\n", - " # Load dataframes\n", - " state_rail_plan = harmonize_srp()\n", - " lost = harominze_lost()\n", - " sb1 = harmonize_sb1()\n", - "\n", - " # Concat for df\n", - " all_projects_df = pd.concat([lost, state_rail_plan, sb1])\n", - "\n", - " # Categorize\n", - " categories = categorize_projects(\n", - " all_projects_df,\n", - " \"project_description\",\n", - " \"project_title\",\n", - " \"project_id\",\n", - " [\n", - " \"operating\",\n", - " \"service\",\n", - " \"zero emission vehicle\",\n", - " \"zev\",\n", - " \"maintain/repair\",\n", - " \"repair/replace\",\n", - " ],\n", - " )\n", - "\n", - " # Merge categorized\n", - " all_projects_df = pd.merge(\n", - " all_projects_df.drop(columns=[\"clean_description\"]),\n", - " categories,\n", - " how=\"left\",\n", - " on=[\"project_description\", \"project_title\", \"project_id\"],\n", - " )\n", - "\n", - " # Rename\n", - " all_projects_df = all_projects_df.drop(columns=[\"auto_project_category_x\"]).rename(\n", - " columns={\"auto_project_category_y\": \"auto_tagged_project_categories\"}\n", - " )\n", - " # Concat for gdf\n", - " all_projects_gdf = pd.concat([sb1])\n", - " all_projects_gdf = all_projects_gdf.set_geometry(\"location\")\n", - "\n", - " return all_projects_df, all_projects_gdf" + "len(lp2000_project), lp2000_project.project_number.nunique()" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "87a29e05-0ba6-40cb-93e2-d097159e6235", + "execution_count": 75, + "id": "219ec508-8706-49f0-ae04-711599d967a2", "metadata": {}, "outputs": [], "source": [ - "# all_projects, all_projects_geo = add_all_projects()" + "def add_project_number(df_with_project_name:pd.DataFrame, right_project:pd.DataFrame, merge_col:str)-> pd.DataFrame:\n", + " m1 = pd.merge(df_with_project_name, right_project, on = merge_col, how = 'inner')\n", + " return m1" ] }, { "cell_type": "code", - "execution_count": 48, - "id": 
"873a88ca-5a47-4bfe-a1d3-715a5bed05bb", - "metadata": { - "scrolled": true, - "tags": [] - }, + "execution_count": 71, + "id": "ad4f5fbd-918e-4c7d-9f11-74060ca6b9a2", + "metadata": {}, "outputs": [], "source": [ - "# all_projects.drop(columns = ['location'])[['project_title','project_category', 'auto_tagged_project_categories','project_description','total_available_funds','funding_notes']].sample(100)" + "lp2000_project_subset = lp2000_project[['project_number', 'project_id']]" ] }, { - "cell_type": "markdown", - "id": "85cfedf8-14aa-4d6c-b30e-cc9f6ee5bbf8", + "cell_type": "code", + "execution_count": 72, + "id": "64e0b2f1-2214-4822-b5d9-6c27a9be79ee", "metadata": {}, + "outputs": [], "source": [ - "### Look at the data" + "lp2000_county_df = pd.merge(lp2000_project_subset, lp2000_county, on = 'project_id', how = 'inner')" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "a7e39b78-af8b-4bc5-8911-572839a72b36", - "metadata": { - "tags": [] - }, + "execution_count": 73, + "id": "633651e5-f043-4a8a-a4b4-9e5d42ac3803", + "metadata": {}, "outputs": [ { - "ename": "KeyError", - "evalue": "\"Column(s) ['project_id'] do not exist\"", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mKeyError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[51], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mall_projects\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgroupby\u001b[49m\u001b[43m(\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlead_agency\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mproject_id\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mnunique\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241m.\u001b[39msort_values(\n\u001b[1;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mproject_id\u001b[39m\u001b[38;5;124m\"\u001b[39m, ascending\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 3\u001b[0m )\u001b[38;5;241m.\u001b[39mhead(\u001b[38;5;241m10\u001b[39m)\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/groupby/generic.py:895\u001b[0m, in \u001b[0;36mDataFrameGroupBy.aggregate\u001b[0;34m(self, func, engine, engine_kwargs, *args, **kwargs)\u001b[0m\n\u001b[1;32m 892\u001b[0m func \u001b[38;5;241m=\u001b[39m maybe_mangle_lambdas(func)\n\u001b[1;32m 894\u001b[0m op \u001b[38;5;241m=\u001b[39m GroupByApply(\u001b[38;5;28mself\u001b[39m, func, args, kwargs)\n\u001b[0;32m--> 895\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[43mop\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 896\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_dict_like(func) \u001b[38;5;129;01mand\u001b[39;00m result \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 897\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:172\u001b[0m, in \u001b[0;36mApply.agg\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mapply_str()\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m is_dict_like(arg):\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43magg_dict_like\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m is_list_like(arg):\n\u001b[1;32m 174\u001b[0m \u001b[38;5;66;03m# we require a list, but not a 'str'\u001b[39;00m\n\u001b[1;32m 175\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39magg_list_like()\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:496\u001b[0m, in \u001b[0;36mApply.agg_dict_like\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 493\u001b[0m selected_obj \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selected_obj\n\u001b[1;32m 494\u001b[0m selection \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_selection\n\u001b[0;32m--> 496\u001b[0m arg \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnormalize_dictlike_arg\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43magg\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mselected_obj\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43marg\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 498\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m selected_obj\u001b[38;5;241m.\u001b[39mndim \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m1\u001b[39m:\n\u001b[1;32m 499\u001b[0m \u001b[38;5;66;03m# key only used for output\u001b[39;00m\n\u001b[1;32m 500\u001b[0m colg \u001b[38;5;241m=\u001b[39m obj\u001b[38;5;241m.\u001b[39m_gotitem(selection, ndim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n", - "File \u001b[0;32m/opt/conda/lib/python3.9/site-packages/pandas/core/apply.py:619\u001b[0m, in \u001b[0;36mApply.normalize_dictlike_arg\u001b[0;34m(self, how, obj, func)\u001b[0m\n\u001b[1;32m 617\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mlen\u001b[39m(cols) 
\u001b[38;5;241m>\u001b[39m \u001b[38;5;241m0\u001b[39m:\n\u001b[1;32m 618\u001b[0m cols_sorted \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mlist\u001b[39m(safe_sort(\u001b[38;5;28mlist\u001b[39m(cols)))\n\u001b[0;32m--> 619\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mKeyError\u001b[39;00m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mColumn(s) \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mcols_sorted\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m do not exist\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 621\u001b[0m aggregator_types \u001b[38;5;241m=\u001b[39m (\u001b[38;5;28mlist\u001b[39m, \u001b[38;5;28mtuple\u001b[39m, \u001b[38;5;28mdict\u001b[39m)\n\u001b[1;32m 623\u001b[0m \u001b[38;5;66;03m# if we have a dict of any non-scalars\u001b[39;00m\n\u001b[1;32m 624\u001b[0m \u001b[38;5;66;03m# eg. {'A' : ['mean']}, normalize all to\u001b[39;00m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;66;03m# be list-likes\u001b[39;00m\n\u001b[1;32m 626\u001b[0m \u001b[38;5;66;03m# Cannot use func.values() because arg may be a Series\u001b[39;00m\n", - "\u001b[0;31mKeyError\u001b[0m: \"Column(s) ['project_id'] do not exist\"" - ] + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_numberproject_idcounty_name
43770b952b66e0205202(007)Los Angeles County
\n", + "
" + ], + "text/plain": [ + " project_number project_id county_name\n", + "4377 0b952b66e020 5202(007) Los Angeles County" + ] + }, + "execution_count": 73, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "all_projects.groupby([\"lead_agency\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", - " \"project_id\", ascending=False\n", - ").head(10)" + "lp2000_county_df.sample()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "daa0b1d3-4416-4537-b568-bdaae9fd1fdb", + "execution_count": 74, + "id": "d45e35b7-604a-4103-ad2c-8e8ba3d8946a", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_idcomment_descest_total_prj_costslocation_nameproject_label_nameoriginal_post_mile_begin_idoriginal_post_mile_end_idrevised_post_mile_begin_indrevised_post_mile_end_indroute_namestate_hwy_indsenate_district_codeupdate_date_timeagency_nameurban_area_codework_type_desccategory_desccurrent_phaseactive_transportation_program__atp_bridge_inspection___scour_evaluationcovid_relief_funds_for_highway_infrastructure_programs_for_stip_covid_augmentationcarbon_reduction_program__crp_congestion_mitigation___air_quality_improvement_program__cmaq_coronavirus_response_and_relief_supplemental_appropriations_act__crrsaa__fundscorridor_mobility_improvement_account__cmia__programcounty_exchange_fundscounty_state_match_programearmarks_projects__hpp,_demo_cpfcds,_etc__emergency_relief__er_ferry_boat_program__fbp__and_ferry_boat_discretionary__fbd__programfunds_for_planning,_programming_and_monitoring___ripgeneral_funded_designated_programshazard_elimination_safety__hes_high_risk_rural_roads_program__hr3_highway_bridge_highway_safety_improvement_program__hsip___infrastructure__state_fundhighway_safety_improvement_program__hsip___non_infrastructure_highway_safety_improvement_program__hsip__infrastructure__federal_fundlocal_partnership_program__lpp_–_competitive__local_roadslocal_roads_rehabilitationrailroad_grade_crossing_protectionrailroad_grade_separationsrebuilding_american_infrastructure_with_sustainability_and_equity__raise__and_multimodal_project_discretionary_grant_programs__e_g_,_infra,_mega,_rstg_or_rural__regional_improvement_program_–_regional_share_of_stip_transportation_enhancement__off_system_regional_surface_transportation_block_grant_program__rstbgp__and_highway_infrastructure_program__hip_regional_transportation_planning_agency__rtpa__stp_match_exchangesb1_funded_freeway_service_patrolshopp__traffic_light_synchronization_program__tlsp___proposition_1b_bond_fundssafe_routes_to_school__sr2s_and_srts_set_aside_coordinated_border_infrastructure__cbi__program_under_fast
_actsolutions_for_congested_corridors_program__sccp_special_programsstate_local_partnership_program__slpp__and_local_partnership_program__lpp_formulaic_structures_seismic_retrofit_trade_corridor_enhancement_account__tcea__programs_–_local_sharetrade_corridor_enhancement_account__tcea__programs_–_state_sharetrade_corridors_improvement_fund__tcif__program_local_streets___roadstraffic_congestion_relief_program___tcrp__project_number
43775202(007)file has been transferred to Saad issa on 12/14/05. Mike Benyamin\\n\\nInactive- msNaNVarious Locations CitywideTraffic Sign UpgradesNaNNaNNaNNaN0-SFRNNaN2010-08-09 20:04:27San Fernando3041.00NaNSignssingle phaseUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknownUnknown0b952b66e020
\n", + "
" + ], + "text/plain": [ + " project_id \\\n", + "4377 5202(007) \n", + "\n", + " comment_desc \\\n", + "4377 file has been transferred to Saad issa on 12/14/05. Mike Benyamin\\n\\nInactive- ms \n", + "\n", + " est_total_prj_costs location_name project_label_name \\\n", + "4377 NaN Various Locations Citywide Traffic Sign Upgrades \n", + "\n", + " original_post_mile_begin_id original_post_mile_end_id \\\n", + "4377 NaN NaN \n", + "\n", + " revised_post_mile_begin_ind revised_post_mile_end_ind route_name \\\n", + "4377 NaN NaN 0-SFR \n", + "\n", + " state_hwy_ind senate_district_code update_date_time agency_name \\\n", + "4377 N NaN 2010-08-09 20:04:27 San Fernando \n", + "\n", + " urban_area_code work_type_desc category_desc current_phase \\\n", + "4377 3041.00 NaN Signs single phase \n", + "\n", + " active_transportation_program__atp_ bridge_inspection___scour_evaluation \\\n", + "4377 Unknown Unknown \n", + "\n", + " covid_relief_funds_for_highway_infrastructure_programs_for_stip_covid_augmentation \\\n", + "4377 Unknown \n", + "\n", + " carbon_reduction_program__crp_ \\\n", + "4377 Unknown \n", + "\n", + " congestion_mitigation___air_quality_improvement_program__cmaq_ \\\n", + "4377 Unknown \n", + "\n", + " coronavirus_response_and_relief_supplemental_appropriations_act__crrsaa__funds \\\n", + "4377 Unknown \n", + "\n", + " corridor_mobility_improvement_account__cmia__program \\\n", + "4377 Unknown \n", + "\n", + " county_exchange_funds county_state_match_program \\\n", + "4377 Unknown Unknown \n", + "\n", + " earmarks_projects__hpp,_demo_cpfcds,_etc__ emergency_relief__er_ \\\n", + "4377 Unknown Unknown \n", + "\n", + " ferry_boat_program__fbp__and_ferry_boat_discretionary__fbd__program \\\n", + "4377 Unknown \n", + "\n", + " funds_for_planning,_programming_and_monitoring___rip \\\n", + "4377 Unknown \n", + "\n", + " general_funded_designated_programs hazard_elimination_safety__hes_ \\\n", + "4377 Unknown Unknown \n", + "\n", + " 
high_risk_rural_roads_program__hr3_ highway_bridge_ \\\n", + "4377 Unknown Unknown \n", + "\n", + " highway_safety_improvement_program__hsip___infrastructure__state_fund \\\n", + "4377 Unknown \n", + "\n", + " highway_safety_improvement_program__hsip___non_infrastructure_ \\\n", + "4377 Unknown \n", + "\n", + " highway_safety_improvement_program__hsip__infrastructure__federal_fund \\\n", + "4377 Unknown \n", + "\n", + " local_partnership_program__lpp_–_competitive__ local_roads \\\n", + "4377 Unknown Unknown \n", + "\n", + " local_roads_rehabilitation railroad_grade_crossing_protection \\\n", + "4377 Unknown Unknown \n", + "\n", + " railroad_grade_separations \\\n", + "4377 Unknown \n", + "\n", + " rebuilding_american_infrastructure_with_sustainability_and_equity__raise__and_multimodal_project_discretionary_grant_programs__e_g_,_infra,_mega,_rstg_or_rural__ \\\n", + "4377 Unknown \n", + "\n", + " regional_improvement_program_–_regional_share_of_stip_transportation_enhancement__off_system_ \\\n", + "4377 Unknown \n", + "\n", + " regional_surface_transportation_block_grant_program__rstbgp__and_highway_infrastructure_program__hip_ \\\n", + "4377 Unknown \n", + "\n", + " regional_transportation_planning_agency__rtpa__stp_match_exchange \\\n", + "4377 Unknown \n", + "\n", + " sb1_funded_freeway_service_patrol \\\n", + "4377 Unknown \n", + "\n", + " shopp__traffic_light_synchronization_program__tlsp___proposition_1b_bond_funds \\\n", + "4377 Unknown \n", + "\n", + " safe_routes_to_school__sr2s_and_srts_ \\\n", + "4377 Unknown \n", + "\n", + " set_aside_coordinated_border_infrastructure__cbi__program_under_fast_act \\\n", + "4377 Unknown \n", + "\n", + " solutions_for_congested_corridors_program__sccp_ special_programs \\\n", + "4377 Unknown Unknown \n", + "\n", + " state_local_partnership_program__slpp__and_local_partnership_program__lpp_formulaic_ \\\n", + "4377 Unknown \n", + "\n", + " structures_seismic_retrofit_ \\\n", + "4377 Unknown \n", + "\n", + " 
trade_corridor_enhancement_account__tcea__programs_–_local_share \\\n", + "4377 Unknown \n", + "\n", + " trade_corridor_enhancement_account__tcea__programs_–_state_share \\\n", + "4377 Unknown \n", + "\n", + " trade_corridors_improvement_fund__tcif__program_local_streets___roads \\\n", + "4377 Unknown \n", + "\n", + " traffic_congestion_relief_program___tcrp__ project_number \n", + "4377 Unknown 0b952b66e020 " + ] + }, + "execution_count": 74, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "all_projects[\n", - " (all_projects.county == \"Kern\")\n", - " & (all_projects.project_description.str.contains(\"Seal Coat\"))\n", - "].drop(columns=[\"location\"])" + "lp2000_project.loc[lp2000_project.project_id == \"5202(007)\"]" ] }, { "cell_type": "code", - "execution_count": null, - "id": "dc906308-31d4-4fde-b492-8218b05cec90", + "execution_count": 77, + "id": "03314dd0-426d-4e29-a6cf-6450757356ae", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "(11272, 11272)" + ] + }, + "execution_count": 77, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# all_projects.groupby(['project_category','auto_tagged_project_categories']).agg({'project_id':'nunique'})" + "len(lp2000_project), lp2000_project.project_id.nunique()" ] }, { "cell_type": "code", - "execution_count": null, - "id": "d2d6ac3a-c517-4df2-b907-0bac0a09e34a", + "execution_count": 81, + "id": "5380e43e-d74d-44a9-a1df-ffe1f916aefd", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "11263" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "all_projects.groupby([\"auto_tagged_project_categories\"]).agg(\n", - " {\"project_id\": \"nunique\"}\n", - ").sort_values(\"project_id\", ascending=False).head(10)" + "len(lp2000_project.drop(columns = ['project_id']).drop_duplicates())" ] }, { "cell_type": "code", - "execution_count": null, - "id": 
"5150da00-2a30-4f4d-bec8-1d9e5c66d623", + "execution_count": 82, + "id": "c0725be7-0bad-4e3e-b664-4db1ade8b3c5", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "9" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "all_projects.groupby([\"project_category\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", - " \"project_id\", ascending=False\n", - ").head(10)" + "11272-11263" ] }, { - "cell_type": "code", - "execution_count": null, - "id": "1bf38631-a734-47b0-9465-fcfb8ebafcad", + "cell_type": "markdown", + "id": "fd3d068b-b95f-494d-bbbd-e0605c68f616", "metadata": {}, - "outputs": [], "source": [ - "all_projects.groupby([\"project_description\"]).agg(\n", - " {\"project_id\": \"nunique\"}\n", - ").sort_values(\"project_id\", ascending=False).head(10)" + "### CTIPS" ] }, { "cell_type": "code", "execution_count": null, - "id": "5c1baa16-e15c-48e7-9772-ef67755f9d21", + "id": "b8f5a6af-db4a-4d1a-9bcc-7e8324b61947", "metadata": {}, "outputs": [], "source": [ - "all_projects.groupby([\"county\"]).agg({\"project_id\": \"nunique\"}).sort_values(\n", - " \"project_id\", ascending=False\n", - ").head(10)" + "def load_ctips(file: str):\n", + " \n", + " df_project = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"project\")\n", + " )\n", + "\n", + " df_county = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"county\")\n", + " )\n", + "\n", + " df_district = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"district\")\n", + " )\n", + "\n", + " df_phase = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"phase_funding\")\n", + " )\n", + " \n", + " df_award = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"awards\")\n", + " )\n", + " \n", + " df_house = to_snakecase(\n", + " 
pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"us_house\")\n", + " )\n", + " \n", + " df_senate = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"senate\")\n", + " )\n", + " \n", + " df_assembly = to_snakecase(\n", + " pd.read_excel(f\"{GCS_FILE_PATH}LP2000_CTIPS/{file}\", sheet_name=\"assembly\")\n", + " )\n", + " return df_project, df_county, df_district, df_phase, df_award, df_house, df_senate, df_assembly" ] }, { "cell_type": "code", "execution_count": null, - "id": "9d55e4ed-9b69-4111-b2ed-69715c9d90c5", + "id": "9372c9cb-a54a-4a2d-8f64-ca919f6b7b75", "metadata": {}, "outputs": [], "source": [ - "all_projects.lead_agency.nunique()" + "ctips_project, ctips_county, ctips_district, ctips_phase, ctips_award, ctips_house, ctips_senate, ctips_assembly = load_ctips('CTIPS.xlsx')" ] }, { "cell_type": "code", "execution_count": null, - "id": "01a534d9-75e4-4ff8-aa11-99db480de733", + "id": "1a53792b-c209-4303-a14f-59ea9ef03c9c", "metadata": {}, "outputs": [], "source": [ - "all_projects.total_project_cost.describe()" + "ctips_project.sample()" + ] + }, + { + "cell_type": "markdown", + "id": "4940bb3c-6170-4e12-a8ff-c4e97d7dbff2", + "metadata": {}, + "source": [ + "### State Rail Plan" ] }, { "cell_type": "code", "execution_count": null, - "id": "6985e5d0-cf27-423f-8775-16eb3c518beb", - "metadata": { - "tags": [] - }, + "id": "8a61f896-808c-44cc-ae3b-5c651bcee78e", + "metadata": {}, "outputs": [], "source": [ - "all_projects.loc[all_projects.fully_funded == \"Fully funded\"].groupby(\n", - " [\"data_source\"]\n", - ").agg({\"project_id\": \"nunique\"})" + "srp_df = har_utils.load_state_rail_plan()" ] }, { "cell_type": "code", "execution_count": null, - "id": "3259fc95-2db6-46ad-8cc6-a0357aa19077", + "id": "5469cc55-0034-469e-b199-991ca7ada378", "metadata": {}, "outputs": [], "source": [ - "all_projects.loc[all_projects.fully_funded == \"Partially funded\"].groupby(\n", - " [\"data_source\"]\n", - 
").agg({\"project_id\": \"nunique\"})" + "srp_df = generate_alphanumeric_ids(srp_df, 8)" ] }, { "cell_type": "code", "execution_count": null, - "id": "2ef08825-9e29-4268-9172-d0d83e08243b", + "id": "95a7939f-4021-4b00-b036-07d5fed90a4a", "metadata": {}, "outputs": [], "source": [ - "all_projects.groupby([\"data_source\"]).agg({\"project_id\": \"nunique\"})" + "srp_df.sample()" ] }, { "cell_type": "code", "execution_count": null, - "id": "5fae701e-4132-4d06-8c27-3e598e072172", + "id": "b2bfef02-8d49-4a62-b49b-3807610f9fe4", "metadata": {}, "outputs": [], "source": [ - "all_projects.groupby([\"fully_funded\"]).agg(\n", - " {\"project_id\": \"nunique\"}\n", - ").reset_index().sort_values(\"project_id\", ascending=False)" + "srp_df_agency = separate_out_df(srp_df, ['project_number', 'lead_agency'])" ] }, { "cell_type": "code", "execution_count": null, - "id": "171611d6-acf9-46d8-9814-20534114d43e", + "id": "98052546-11e9-4e94-8cc1-0e334e5606f4", "metadata": {}, "outputs": [], - "source": [ - "all_projects.groupby([\"data_source\", \"fully_funded\"]).agg({\"project_id\": \"nunique\"})" - ] + "source": [] } ], "metadata": { diff --git a/project_list/ctips_01_18_2024.ipynb b/project_list/ctips_01_18_2024.ipynb deleted file mode 100644 index f7f4cb735..000000000 --- a/project_list/ctips_01_18_2024.ipynb +++ /dev/null @@ -1,1422 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "1b222a19", - "metadata": {}, - "source": [ - "## CTIPS\n", - "* https://ctips-prod.dot.ca.gov/ctips/LoginMediatorForm.do\n", - "\n", - "### To do\n", - "* Ask if DSHOPP means draft SHOPP project\n", - "* PROJSCHE - not a lot of matches" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "75094621", - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd \n", - "import sqlalchemy \n", - "import sys \n", - "import re\n", - "import oracledb " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94838472", - "metadata": {}, - "outputs": 
[], - "source": [ - "oracledb.version = \"8.3.0\" \n", - "sys.modules[\"cx_Oracle\"] = oracledb " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a4a53471", - "metadata": {}, - "outputs": [], - "source": [ - "pd.options.display.max_columns = 100\n", - "pd.options.display.float_format = \"{:.2f}\".format\n", - "pd.set_option(\"display.max_rows\", None)\n", - "pd.set_option(\"display.max_colwidth\", None)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "90056c61", - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "ENGINE_PATH_WIN_AUTH = f\"{DIALECT}://{USERNAME}:{PASSWORD}@{HOST}:{PORT}/?service_name={SERVICE}\" " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee70eded", - "metadata": {}, - "outputs": [], - "source": [ - "engine = sqlalchemy.create_engine(ENGINE_PATH_WIN_AUTH) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6665d753", - "metadata": {}, - "outputs": [], - "source": [ - "def to_snakecase(df):\n", - " df.columns = df.columns.str.lower().str.replace(' ','_')\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "id": "b429d5f5", - "metadata": {}, - "source": [ - "### Project\n", - "Project.agencyid = project sponsor\n", - "\n", - "Implpaed = Implementing Agency for PA&ED\n", - "\n", - "Implpse = Implementing Agency for PS&E\n", - "\n", - "implcon = Implementing Agency for Construction\n", - "\n", - "implrw = Implementing Agency for Right of Way\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "697a0653", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "appdate, \n", - "archive,\n", - "agencyid,\n", - "bond99,\n", - "cmia,\n", - "ctips_id,\n", - "const_date,\n", - "countyid,\n", - "countyid2,\n", - "countyid3,\n", - "chg_offcl,\n", - "chg_qual1,\n", - "chg_qual2,\n", - "districtid,\n", - "document,\n", - "docyear,\n", - "ea_number,\n", - "high_ver,\n", - 
"high_offcl,\n", - "implpaed, \n", - "implpse, \n", - "implrw, \n", - "implcon, \n", - "lupdate, \n", - "needpurpose,\n", - "progcode1,\n", - "ppno,\n", - "proj_desc,\n", - "postmiles1,\n", - "pm1b,\n", - "pm2b,\n", - "pm3b,\n", - "pm1a,\n", - "pm2a,\n", - "pm3a,\n", - "projcomp_date,\n", - "projectid,\n", - "route1,\n", - "route2,\n", - "route3,\n", - "rtl,\n", - "stip,\n", - "shopp,\n", - "title,\n", - "tcif,\n", - "tcrpno,\n", - "tcrp,\n", - "urbanid,\n", - "version\n", - "FROM ctips.project\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "485deb02", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "56598a58", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df.projectid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db0c7bb6", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df.ctips_id.nunique()" - ] - }, - { - "cell_type": "markdown", - "id": "a1f4609c", - "metadata": {}, - "source": [ - "### A bit of cleaning" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1295f8de", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df = projects_df.fillna(projects_df.dtypes.replace({'float64': 0.0, 'object': 'None', 'int64': 0}))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cd3800ae", - "metadata": {}, - "outputs": [], - "source": [ - "string_cols = [col for col in projects_df.columns if projects_df[col].dtype == 'object']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "51ef6caf", - "metadata": {}, - "outputs": [], - "source": [ - "string_cols = [\n", - " 'needpurpose',\n", - " 'proj_desc',\n", - " 'route1',\n", - " 'title']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c069d0d", - "metadata": {}, - "outputs": [], - "source": [ - "for i in string_cols:\n", - " 
projects_df[i] = projects_df[i].str.title().str.lstrip().str.rstrip()\n", - " projects_df[i] = projects_df[i].replace(r'\\s+', ' ', regex=True)" - ] - }, - { - "cell_type": "markdown", - "id": "3a47796d", - "metadata": {}, - "source": [ - "### 1 row = 1 project \n", - "* Some projects don't have a high version?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "665641c4", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df2 = projects_df.sort_values(by = ['high_offcl', 'high_ver','archive'], ascending = [False, False, False])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a800db0c", - "metadata": {}, - "outputs": [], - "source": [ - "# Drop projects by ctips_id\n", - "projects_df3 = projects_df2.drop_duplicates(subset = ['ctips_id'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d1e5c25e", - "metadata": {}, - "outputs": [], - "source": [ - "# Filter out projects that are finished\n", - "projects_df3 = projects_df3.loc[projects_df3.archive == 0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83d42e8a", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df3.ctips_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3cee0056", - "metadata": {}, - "outputs": [], - "source": [ - "# Filter out any rows where chg_qual1==7 because those are projects that are deleted\n", - "projects_df3 = projects_df3[projects_df3.chg_qual1 != 7]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "afe30974", - "metadata": {}, - "outputs": [], - "source": [ - "len(projects_df3)" - ] - }, - { - "cell_type": "markdown", - "id": "9be1132f", - "metadata": {}, - "source": [ - "#### Ask if DSHOPP means draft shopp?" 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1de0e049", - "metadata": {}, - "outputs": [], - "source": [ - "projects_df3.document.unique()" - ] - }, - { - "cell_type": "markdown", - "id": "190d2323", - "metadata": {}, - "source": [ - "### PROJSCHE" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2ea89922", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "projectid,\n", - "m020 AS pa_ed_begin,\n", - "m200a AS pa_ed_end,\n", - "m200b AS ps_e_begin,\n", - "m224 AS begin_row,\n", - "m410 AS end_row,\n", - "m500 AS con_start_date,\n", - "m600 AS con_end_date,\n", - "m700 AS begin_closeout,\n", - "m800 AS end_closeout\n", - "FROM ctips.projsche\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ff9fd013", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9d335f70", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a9ee8aa", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_drop_cols = list(projsche_df.columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d6a08152", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_drop_cols.remove('projectid')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cc2beec3", - "metadata": {}, - "outputs": [], - "source": [ - "# I want to drop the rows in which ALL values in the date columns are empty\n", - "projsche_df2 = projsche_df.dropna(how = \"all\", subset = projsche_drop_cols).reset_index(drop = True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb3109a9", - "metadata": {}, - "outputs": [], - "source": [ - "len(projsche_df2), len(projsche_df)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "4ab6a530", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_df2.projectid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4c93886", - "metadata": {}, - "outputs": [], - "source": [ - "projsche_df2.info()" - ] - }, - { - "cell_type": "markdown", - "id": "b36bd39b", - "metadata": {}, - "source": [ - "#### Not a lot of matching values" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b323c658", - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(projsche_df2, projects_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "724cb2bf", - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(projsche_df, projects_df3, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a590548b", - "metadata": {}, - "outputs": [], - "source": [ - "m1 = pd.merge(projects_df3, projsche_df2, on ='projectid', how = 'left')" - ] - }, - { - "cell_type": "markdown", - "id": "afdceff8", - "metadata": {}, - "source": [ - "### AGENCY" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f3e0d05b", - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "agency_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "name AS agency_name,\n", - "agencyid\n", - "FROM ctips.agncy\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "be3c31ba", - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(m1, agency_df, on ='agencyid', how = 'outer', indicator = True)[['_merge']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1fd05b1a", - "metadata": {}, - "outputs": [], - "source": [ - "m2 = pd.merge(m1, agency_df, on ='agencyid', how = 'left')" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "edd6dcd3", - "metadata": {}, - "outputs": [], - "source": [ - "agency_cols = ['agencyid', 'agency_name', 'implpaed', 'implpse', 'implrw', 'implcon']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4c67a81d", - "metadata": {}, - "outputs": [], - "source": [ - "m2.loc[m2.implpaed != \"None\"][agency_cols].sample()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0ceb0a3b", - "metadata": {}, - "outputs": [], - "source": [ - "m2.loc[m2.implrw != \"None\"][agency_cols].head()" - ] - }, - { - "cell_type": "markdown", - "id": "474ae25f", - "metadata": {}, - "source": [ - "### COUNTY" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "cfb16bcf", - "metadata": {}, - "outputs": [], - "source": [ - "county_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "name AS county_name,\n", - "countyid\n", - "FROM ctips.county\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a04fc040", - "metadata": {}, - "outputs": [], - "source": [ - "m3 = pd.merge(m2, county_df, on ='countyid', how = 'left')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a8a21f03", - "metadata": {}, - "outputs": [], - "source": [ - "m3.sample()" - ] - }, - { - "cell_type": "markdown", - "id": "b53cb205", - "metadata": {}, - "source": [ - "### FUNDLINE\n", - "* For action: Action: P = programmed, V= vote, A=award" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b6b5c018", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "action,\n", - "con,\n", - "rw,\n", - "pe_paed,\n", - "pe_env,\n", - "pe_rw,\n", - "pe_con,\n", - "pe_total,\n", - "fundlineid,\n", - "fundtypeid,\n", - "line_year,\n", - "actiondate\n", - "FROM ctips.fundline\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60fdfd34", - 
"metadata": {}, - "outputs": [], - "source": [ - "fundline_df.fundlineid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "db97ac33", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df.fundlineid.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "276178ce", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df.fundtypeid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "89337dbb", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df.fundtypeid.value_counts().sample(5)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "904088cb", - "metadata": {}, - "outputs": [], - "source": [ - "len(fundline_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5045315e", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df.action.value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "818684a9", - "metadata": {}, - "outputs": [], - "source": [ - "fundline_df.loc[fundline_df.fundtypeid == 20700009194]" - ] - }, - { - "cell_type": "markdown", - "id": "e2407ba7", - "metadata": {}, - "source": [ - "### Fundtype\n", - "* Fundtype.agencyid = funding agency" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f4efe574", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "fundtypeid,\n", - "fundid,\n", - "progcode,\n", - "programid,\n", - "projectid,\n", - "agencyid\n", - "FROM ctips.fundtype\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3efede49", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a3633eac", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df.fundid.nunique()" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "id": "425a034a", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df.projectid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "048155d8", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df.fundtypeid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b80d227f", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_df.fundtypeid.value_counts().head()" - ] - }, - { - "cell_type": "markdown", - "id": "6784b287", - "metadata": {}, - "source": [ - "### Do the merges\n", - "#### Merge fundtype and fundline" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5c042959", - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(fundtype_df,\n", - " fundline_df, \n", - " on = ['fundtypeid'], \n", - " how = \"outer\",\n", - " indicator = True,)[['_merge']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "121320f8", - "metadata": {}, - "outputs": [], - "source": [ - "fund_m1 = pd.merge(fundtype_df,fundline_df, on = ['fundtypeid'], how = \"left\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c52d768", - "metadata": {}, - "outputs": [], - "source": [ - "len(fund_m1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "567fa3bf", - "metadata": {}, - "outputs": [], - "source": [ - "fund_m1.head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dfc1128b", - "metadata": {}, - "outputs": [], - "source": [ - "fund_m1.projectid.nunique(), fund_m1.fundtypeid.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a03c9f1f", - "metadata": {}, - "outputs": [], - "source": [ - "fund_m1.fundtypeid.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f54aa5aa", - "metadata": {}, - "outputs": [], - "source": [ - "fund_m1.action.value_counts()" - ] - }, - { - 
"cell_type": "markdown", - "id": "bd6ec50d", - "metadata": {}, - "source": [ - "#### Merge subset of project with the merge above" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ae18d23b", - "metadata": {}, - "outputs": [], - "source": [ - "project_preview = ['ctips_id','projectid', 'high_ver', 'high_offcl']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "23be17fd", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_m2 = pd.merge(m3[project_preview], fund_m1, on = ['projectid'], how = \"inner\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71a47b1f", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_m2.projectid.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bb4336cd", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_m2.projectid.value_counts().describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "94b9fffe", - "metadata": {}, - "outputs": [], - "source": [ - "project_preview = project_preview + ['title']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "5be904f1", - "metadata": {}, - "outputs": [], - "source": [ - "fundtype_m2.columns" - ] - }, - { - "cell_type": "markdown", - "id": "8fd26495", - "metadata": {}, - "source": [ - "#### Aggregate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02e40aaa", - "metadata": {}, - "outputs": [], - "source": [ - "columns_to_agg = {**dict.fromkeys(['con', 'rw',\n", - " 'pe_paed', 'pe_env', 'pe_rw', 'pe_con', 'pe_total'], 'sum')}\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e34d14de", - "metadata": {}, - "outputs": [], - "source": [ - "columns_to_agg" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f527e5f2", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost = 
fundtype_m2.groupby(['ctips_id','fundid','progcode','programid']).agg(columns_to_agg).reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ab78b69c", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost.sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c588a4fc", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost.con.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b3f6c66", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost.ctips_id.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5db3516", - "metadata": {}, - "outputs": [], - "source": [ - "len(total_cost), total_cost.ctips_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fbd926f7", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost.loc[total_cost.ctips_id == 20600003977]" - ] - }, - { - "cell_type": "markdown", - "id": "710e214f", - "metadata": {}, - "source": [ - "### Progmain" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9afe9c87", - "metadata": {}, - "outputs": [], - "source": [ - "progmain_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "programid,\n", - "category AS program\n", - "FROM ctips.progmain\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "fea4fa65", - "metadata": {}, - "outputs": [], - "source": [ - "progmain_df.head()" - ] - }, - { - "cell_type": "markdown", - "id": "8c5a0735", - "metadata": {}, - "source": [ - "### Fund" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "df1da657", - "metadata": {}, - "outputs": [], - "source": [ - "fund_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "fund,\n", - "fundid\n", - "FROM ctips.fund\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "markdown", - "id": "b8971e64", - "metadata": {}, - "source": [ - "### Progsub" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "id": "32dda1ed", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "progcode,\n", - "progdesc\n", - "FROM ctips.progsub\n", - "\"\"\", engine) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "eb703a9c", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df.head(1)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87344fdc", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "dced772d", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df.progcode.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8d9fe332", - "metadata": {}, - "outputs": [], - "source": [ - "double_ids = ['20.30.010.820',\n", - " '20.XX.723.000',\n", - " '20.30.010.810',\n", - " '20.XX.720.100',\n", - " '20.30.010.817',\n", - " '20.30.210.200'\n", - " ]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b9cb8e6", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df.loc[progsub_df.progcode.isin(double_ids)].sort_values('progcode')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "decf3559", - "metadata": {}, - "outputs": [], - "source": [ - "progsub_df2 = progsub_df.drop_duplicates(subset = ['progcode'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bf1cdbbe", - "metadata": {}, - "outputs": [], - "source": [ - "len(progsub_df)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1a7e9af2", - "metadata": {}, - "outputs": [], - "source": [ - "len(progsub_df2)" - ] - }, - { - "cell_type": "markdown", - "id": "6da3af8c", - "metadata": {}, - "source": [ - "#### Merge" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a945fa00", - "metadata": {}, - "outputs": [], - "source": [ 
- "final_fin_df = (total_cost.merge(progmain_df, on = ['programid'], how = \"left\")\n", - " .merge(fund_df, on =['fundid'], how = \"left\")\n", - " .merge(progsub_df2, on = ['progcode'], how = 'left'))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "21a2f425", - "metadata": {}, - "outputs": [], - "source": [ - "final_fin_df = final_fin_df.drop(columns = ['fundid', 'progcode','programid'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0289feb1", - "metadata": {}, - "outputs": [], - "source": [ - "final_fin_df.sample(3)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "83f92b9f", - "metadata": {}, - "outputs": [], - "source": [ - "final_fin_df.projectid.value_counts().head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "355bbeb4", - "metadata": {}, - "outputs": [], - "source": [ - "final_fin_df.projectid.value_counts().head()" - ] - }, - { - "cell_type": "markdown", - "id": "227aa0b4", - "metadata": {}, - "source": [ - "#### Find Total Cost\n", - "##### CLARIFY FTIP projects have `pe_total` value so figure out how to find the ftip projects and sum those up" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d07114d8", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost = final_fin_df.groupby(['ctips_id']).agg(columns_to_agg).reset_index()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c72d2702", - "metadata": {}, - "outputs": [], - "source": [ - "# pe_test = total_cost.loc[(total_cost.pe_con != 0) & (total_cost.pe_env != 0) & (total_cost.pe_rw != 0) & (total_cost.pe_paed != 0)& (total_cost.pe_total != 0)]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b4f913ca", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost['total_cost'] = total_cost.con + total_cost.rw + total_cost.pe_paed + total_cost.pe_env + total_cost.pe_rw + total_cost.pe_con" - ] - }, - { - "cell_type": 
"code", - "execution_count": null, - "id": "90d7e5c9", - "metadata": {}, - "outputs": [], - "source": [ - "# 6,638,471,000\n", - "total_cost['total_cost'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "74e1fd2c", - "metadata": {}, - "outputs": [], - "source": [ - "total_cost.sort_values(by = ['total_cost'], ascending = False).head()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b87393e7", - "metadata": {}, - "outputs": [], - "source": [ - "len(total_cost), total_cost.ctips_id.nunique()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8679ed80", - "metadata": {}, - "outputs": [], - "source": [ - "len(m3), m3.ctips_id.nunique()" - ] - }, - { - "cell_type": "markdown", - "id": "45ac21f4", - "metadata": {}, - "source": [ - "#### Agency name missing?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "79a85fc6", - "metadata": {}, - "outputs": [], - "source": [ - "m3.loc[m3.ctips_id == 20600002404]" - ] - }, - { - "cell_type": "markdown", - "id": "8f89fb4a", - "metadata": {}, - "source": [ - "### Political" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a57b8486", - "metadata": {}, - "outputs": [], - "source": [ - "political_df = pd.read_sql_query(\"\"\" \n", - "SELECT \n", - "assembly01,\n", - "ushouse01,\n", - "ssenate01,\n", - "projectid\n", - "FROM ctips.politcal\n", - "\"\"\", engine) \n", - "# Drop any rows with nulls\n", - "political_df = political_df.dropna(how = \"any\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ef3ebf38", - "metadata": {}, - "outputs": [], - "source": [ - "pd.merge(m3, political_df, on ='projectid', how = 'outer', indicator = True)[['_merge']].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "909827fb", - "metadata": {}, - "outputs": [], - "source": [ - "m4 = pd.merge(m3, political_df, on ='projectid', how = 'left')" - ] - } - ], - "metadata": { 
- "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/project_list/sb125_list.ipynb b/project_list/sb125_list.ipynb index 8a05dc83b..44ca790c9 100644 --- a/project_list/sb125_list.ipynb +++ b/project_list/sb125_list.ipynb @@ -23,7 +23,7 @@ "import geopandas as gpd\n", "import pandas as pd\n", "from calitp_data_analysis import utils\n", - "from calitp_data_analysis.sql import to_snakecase" + "from calitp_data_analysis.sql import to_snakecase\n" ] }, { @@ -46,7 +46,7 @@ "metadata": {}, "outputs": [], "source": [ - "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/sb125/\"" + "GCS_FILE_PATH = \"gs://calitp-analytics-data/data-analyses/sb125/local_transit_list/\"" ] }, { @@ -198,7 +198,7 @@ "outputs": [], "source": [ "def load_srp():\n", - " df = har_utils.load_state_rail_plan()\n", + " df = srp_utils.clean_state_rail_plan(srp_utils.state_rail_plan_file)\n", " df[\"source\"] = \"State Rail Plan\"\n", " df[\"program\"] = \"State Rail Plan\"\n", " df[\"dds_phase\"] = \"Planned\"\n", @@ -232,6 +232,7 @@ "outputs": [], "source": [ "def load_sb1():\n", + " # Only includes in progress/incomplete projects\n", " df = sb1_utils.load_sb1()\n", " df[\"source\"] = \"SB1 Feature Server\"\n", " df[\"dds_phase\"] = \"Under Construction\"\n", @@ -292,131 +293,6 @@ "sb1_df.projprogram.value_counts()" ] }, - { - "cell_type": "code", - "execution_count": 11, - "id": "09da2d83-6fa4-42c3-a2c3-5569a6d4ec54", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Index(['projectid', 'projname', 'projcatcode', 'projcategory', 'projprogcode',\n", - " 'projprogram', 'multiprogfunded', 'projstatus', 
'description', 'cost',\n", - " 'assemblydistrict', 'senatedistrict', 'assemblycode', 'senatecode',\n", - " 'countyname', 'cityname', 'countycode', 'citycode', 'appagencyname',\n", - " 'impagencyname', 'geometry', 'totalcosts', 'routes', 'constyear',\n", - " 'costfull', 'projagency', 'source', 'dds_phase'],\n", - " dtype='object')" - ] - }, - "execution_count": 11, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sb1_df.columns" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "646e8e9e-a61c-45f6-880f-371952dd1843", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
projnamecosttotalcostscostfull
0Building Up Lossan North Improvement Program147930000NaN147930000.00
1All Aboard40412000NaN40412000.00
2DublinPleasanton Capacity Improvement And Congestion Reduction Program20500000NaN20500000.00
3Los Angeles Region Transit System Integration And Modernization Program Of Projects1088499000NaN1088499000.00
4Southwest Fresno Community Connector7798000NaN7798000.00
\n", - "
" - ], - "text/plain": [ - " projname \\\n", - "0 Building Up Lossan North Improvement Program \n", - "1 All Aboard \n", - "2 DublinPleasanton Capacity Improvement And Congestion Reduction Program \n", - "3 Los Angeles Region Transit System Integration And Modernization Program Of Projects \n", - "4 Southwest Fresno Community Connector \n", - "\n", - " cost totalcosts costfull \n", - "0 147930000 NaN 147930000.00 \n", - "1 40412000 NaN 40412000.00 \n", - "2 20500000 NaN 20500000.00 \n", - "3 1088499000 NaN 1088499000.00 \n", - "4 7798000 NaN 7798000.00 " - ] - }, - "execution_count": 12, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sb1_df[[\"projname\", \"cost\", \"totalcosts\", \"costfull\"]].head()" - ] - }, { "cell_type": "markdown", "id": "01a9b959-4980-4f01-a846-d2ee187483e7", @@ -427,7 +303,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 11, "id": "1725c8ef-4f4a-4853-a0da-e522d2c66b8d", "metadata": {}, "outputs": [], @@ -441,7 +317,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 12, "id": "1d61aabb-01c4-4dce-adc4-58fb8f34663a", "metadata": {}, "outputs": [], @@ -451,7 +327,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 13, "id": "c730a842-c9ff-42f4-a6b6-486448990623", "metadata": {}, "outputs": [ @@ -581,7 +457,7 @@ "1 NaN NaN Blackcat Under Construction " ] }, - "execution_count": 15, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -592,7 +468,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 14, "id": "fa78dddc-c348-4ace-b04a-711adcc0c489", "metadata": {}, "outputs": [], @@ -600,6 +476,12 @@ "def aggregate_to_one_line(\n", " df: pd.DataFrame, column_to_group: str, column_to_summarize: str\n", "):\n", + " \"\"\"\n", + " Aggregate all values onto one line by one goruping val.\n", + " Ex: project ABC has two rows because it has two values for the \"fund column\"\n", + " as it receives 
money from fund 1 and fund 2. This function will\n", + " combine fund 1 and fund 2 to fund 1, fund2 into one row.\n", + " \"\"\"\n", " df[f\"new_{column_to_summarize}\"] = df.groupby(column_to_group)[\n", " column_to_summarize\n", " ].transform(lambda x: \",\".join(x))\n", @@ -611,26 +493,27 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 15, "id": "84638089-3c86-46fa-9911-5ac18991de5d", "metadata": {}, "outputs": [], "source": [ "def load_lp2000(file: str):\n", + " LP2000_PATH = \"gs://calitp-analytics-data/data-analyses/project_list/LP2000_CTIPS/\"\n", " df_project = to_snakecase(\n", - " pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"project\")\n", + " pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"project\")\n", " )\n", "\n", " df_county = to_snakecase(\n", - " pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"county\")\n", + " pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"county\")\n", " ).drop(columns=[\"project_label_name\"])\n", "\n", " df_district = to_snakecase(\n", - " pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"district\")\n", + " pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"district\")\n", " ).drop(columns=[\"project_label_name\"])\n", "\n", " df_award = to_snakecase(\n", - " pd.read_excel(f\"{GCS_FILE_PATH}{file}\", sheet_name=\"awards\")\n", + " pd.read_excel(f\"{LP2000_PATH}{file}\", sheet_name=\"awards\")\n", " )\n", "\n", " # Clean up awards so if project has multiple entries, this is all\n", @@ -678,12 +561,12 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 16, "id": "80a62aa6-79df-411c-b87f-c18cefa37af4", "metadata": {}, "outputs": [], "source": [ - "lp2000_df = load_lp2000(\"LP2000.xlsx\")" + "lp2000_df = load_lp2000(\"LP2000_projects.xlsx\")" ] }, { @@ -696,7 +579,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 17, "id": "8192764f-6b2c-41e6-892e-0b3debace384", "metadata": {}, "outputs": [], @@ -722,7 +605,7 @@ }, { 
"cell_type": "code", - "execution_count": 20, + "execution_count": 18, "id": "d11cb637-71b5-4dde-81cd-84dae5f79ff4", "metadata": {}, "outputs": [], @@ -846,7 +729,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 19, "id": "ef2297c5-825c-49be-a892-3081052516c4", "metadata": {}, "outputs": [ @@ -854,7 +737,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" ] } @@ -883,7 +766,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 20, "id": "eb25aa9a-92ad-4b95-bb62-6b3b0bca3495", "metadata": {}, "outputs": [ @@ -929,11 +812,11 @@ " \n", " \n", " \n", - " 10447\n", + " 10448\n", " No Title\n", " Scag\n", - " Westlake Macarthur Park Pedestrian Improvement Project. Install Pedestrian Improvements Incl Pedestrian Lighting, Sidewalk Enhancements, Street Furniture & Trees, Enhanced Crosswalks, & Bus Stop Amenities.\n", - " 1674000.00\n", + " Western Av Bus Stop & Pedestrian Improvement Project. 
Install Pedestrian And Transit Amenities To Enhance The Pedestrian Environment Along Western Av Btw Exposition Bl & I-10 Freeway.\n", + " 1472000.00\n", " 0.00\n", " Partially Funded\n", " None\n", @@ -945,7 +828,7 @@ " None\n", " None\n", " Scag Lrtp\n", - " notes: System: Local Highway, Route #: 0, Route Name: Nan, From: Union, To: Hoover\n", + " notes: System: Local Highway, Route #: 0, Route Name: Nan, From: Exposition, To: I-10\n", " \n", " \n", " \n", @@ -954,28 +837,28 @@ ], "text/plain": [ " project_title lead_agency \\\n", - "10447 No Title Scag \n", + "10448 No Title Scag \n", "\n", - " project_description \\\n", - "10447 Westlake Macarthur Park Pedestrian Improvement Project. Install Pedestrian Improvements Incl Pedestrian Lighting, Sidewalk Enhancements, Street Furniture & Trees, Enhanced Crosswalks, & Bus Stop Amenities. \n", + " project_description \\\n", + "10448 Western Av Bus Stop & Pedestrian Improvement Project. Install Pedestrian And Transit Amenities To Enhance The Pedestrian Environment Along Western Av Btw Exposition Bl & I-10 Freeway. 
\n", "\n", " total_project_cost total_available_funds phase post_mile \\\n", - "10447 1674000.00 0.00 Partially Funded None \n", + "10448 1472000.00 0.00 Partially Funded None \n", "\n", " county city ct_district project_start_year project_completion_year \\\n", - "10447 None None None None None \n", + "10448 None None None None None \n", "\n", " geometry grant_program source \\\n", - "10447 None None Scag Lrtp \n", + "10448 None None Scag Lrtp \n", "\n", - " notes \\\n", - "10447 notes: System: Local Highway, Route #: 0, Route Name: Nan, From: Union, To: Hoover \n", + " notes \\\n", + "10448 notes: System: Local Highway, Route #: 0, Route Name: Nan, From: Exposition, To: I-10 \n", "\n", " funding_notes \n", - "10447 " + "10448 " ] }, - "execution_count": 22, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -986,7 +869,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 21, "id": "d2d15a6b-9c44-4124-8d60-da74a9180c52", "metadata": {}, "outputs": [ @@ -994,7 +877,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" ] } @@ -1032,7 +915,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 22, "id": "a7102189-3816-4423-9220-0dc340603b37", "metadata": {}, "outputs": [ @@ -1040,7 +923,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" ] } @@ -1084,7 +967,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 23, "id": "5648d023-4b18-4d8c-a7d2-0a43bc105649", "metadata": {}, "outputs": [ @@ -1095,7 +978,7 @@ "Name: grant_program, dtype: int64" ] }, - "execution_count": 25, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } @@ -1106,7 +989,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 24, "id": "a2d74753-0c1c-4a40-a5ed-761074112b13", "metadata": {}, "outputs": [ @@ -1114,7 +997,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" ] } @@ -1151,7 +1034,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 25, "id": "d588d001-9707-472f-a9bb-5dbf2cfd0d95", "metadata": {}, "outputs": [ @@ -1159,7 +1042,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. To actually get the groups, use str.extract.\n", + "/tmp/ipykernel_2642/4025938624.py:67: UserWarning: This pattern is interpreted as a regular expression, and has match groups. 
To actually get the groups, use str.extract.\n", " cost_columns = df.columns[df.columns.str.contains(\"(cost|funds)\")].tolist()\n" ] } @@ -1201,22 +1084,23 @@ "metadata": {}, "source": [ "### Stack\n", - "TO DO\n", + "Waiting\n", "* Clarify the monetary cols of SB1 & BlackCat\n", - "* Harmonize county/city/lead agency names\n", - "* LRTP grant program should be none'\n", - "* Categorize it?\n", "\n", "Christian's Notes\n", "* What amount of transit related projects are in this big list? \n", "* How big the projects are by cost? \n", "* Compare the cost of all the transit projects against all the projects in the list?\n", - "* Use percentages." + "* Use percentages.\n", + "\n", + "Done\n", + "* Harmonize county/city/lead agency names\n", + "* LRTP grant program should be none'" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 26, "id": "2ebe704e-3375-4be9-bb2c-5ee6079ba0d3", "metadata": {}, "outputs": [], @@ -1235,7 +1119,7 @@ }, { "cell_type": "code", - "execution_count": 78, + "execution_count": 27, "id": "189437cf-117a-4b8a-a89f-ae68fe988cc4", "metadata": {}, "outputs": [], @@ -1251,17 +1135,17 @@ " df[column] = df[column].replace(r\"\\s+\", \" \", regex=True)\n", "\n", " # Remove specific characters\n", - " chars_to_remove = [\"-\", \"/\", \")\", \"(\", \".\", 'County', 'Of','District']\n", + " chars_to_remove = [\"-\", \"/\", \")\", \"(\", \".\", \"County\", \"Of\", \"District\"]\n", " for char in chars_to_remove:\n", " df[column] = df[column].str.replace(char, \"\")\n", - " \n", - " df[column] = df[column].astype(str).replace('\\d+', '', regex=True)\n", + "\n", + " df[column] = df[column].astype(str).replace(\"\\d+\", \"\", regex=True)\n", " return df" ] }, { "cell_type": "code", - "execution_count": 79, + "execution_count": 28, "id": "6c911fdc-7bf4-4529-91bd-dcfe6667ee78", "metadata": {}, "outputs": [ @@ -1269,28 +1153,28 @@ "name": "stderr", "output_type": "stream", "text": [ - "/tmp/ipykernel_928/180325038.py:14: FutureWarning: The 
default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + "/tmp/ipykernel_2642/309772486.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", " df[column] = df[column].str.replace(char, \"\")\n" ] } ], "source": [ - "complete = clean_strings(complete, 'lead_agency')" + "complete = clean_strings(complete, \"lead_agency\")" ] }, { "cell_type": "code", - "execution_count": 80, + "execution_count": 29, "id": "7d458a7b-63ee-428a-9a81-61cf32b88e7d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1031" + "1056" ] }, - "execution_count": 80, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -1301,17 +1185,17 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 30, "id": "ccaacd62-b205-42d7-9338-bba3fdb27404", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "29381" + "29420" ] }, - "execution_count": 29, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } @@ -1322,14 +1206,14 @@ }, { "cell_type": "code", - "execution_count": 30, + "execution_count": 31, "id": "4d391464-d622-44ae-99ec-3f6829ccc589", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "LP2000 11233\n", + "LP2000 11272\n", "Blackcat 3385\n", "Fresno Cog Lrtp 3147\n", "Scag Lrtp 2952\n", @@ -1354,7 +1238,7 @@ "Name: source, dtype: int64" ] }, - "execution_count": 30, + "execution_count": 31, "metadata": {}, "output_type": "execute_result" } @@ -1363,276 +1247,389 @@ "complete.source.value_counts()" ] }, + { + "cell_type": "markdown", + "id": "3be25882-b1fb-47a5-9c3e-7fb9ea11e37b", + "metadata": {}, + "source": [ + "#### Try to find duplicated projects" + ] + }, { "cell_type": "code", - "execution_count": 58, - "id": 
"a6a748ee-6712-48fc-b607-4501823d3e58", + "execution_count": 32, + "id": "f47bff65-2a07-41b1-b5ee-8bf27de8b1fc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_titlelead_agencyproject_descriptiontotal_project_costtotal_available_fundsphasepost_milecountycityct_districtproject_start_yearproject_completion_yeargeometrygrant_programsourcenotesfunding_notes
12136No TitleScagWiden Riverside Dr From Pipeline Ave To Fern Ave From 4 To 6 Lanes5089000.000.00Partially FundedNoneNoneNoneNoneNoneNoneNoneNoneScag Lrtpnotes: System: Local Highway, Route #: 0, Route Name: Riverside Dr, From: Pipeline Ave, To: Fern Ave
\n", + "
" + ], + "text/plain": [ + " project_title lead_agency \\\n", + "12136 No Title Scag \n", + "\n", + " project_description \\\n", + "12136 Widen Riverside Dr From Pipeline Ave To Fern Ave From 4 To 6 Lanes \n", + "\n", + " total_project_cost total_available_funds phase post_mile \\\n", + "12136 5089000.00 0.00 Partially Funded None \n", + "\n", + " county city ct_district project_start_year project_completion_year \\\n", + "12136 None None None None None \n", + "\n", + " geometry grant_program source \\\n", + "12136 None None Scag Lrtp \n", + "\n", + " notes \\\n", + "12136 notes: System: Local Highway, Route #: 0, Route Name: Riverside Dr, From: Pipeline Ave, To: Fern Ave \n", + "\n", + " funding_notes \n", + "12136 " + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "complete.sample()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "57a6ba28-e795-4bf2-a1b3-83f747d5c7b2", + "metadata": {}, + "outputs": [], + "source": [ + "projects_main_info = complete.project_title + '-' + complete.project_description + '-' + complete.source + '-' + complete.county + '-' + complete.notes + '-' + complete.total_project_cost.astype(str)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "d22e2d61-847f-4c7d-b542-04fb1ef65b8a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "1062" + "pandas.core.series.Series" ] }, - "execution_count": 58, + "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(complete[['lead_agency']].sort_values(by = ['lead_agency']).drop_duplicates())" + "type(projects_main_info)" ] }, { "cell_type": "code", - "execution_count": 134, - "id": "b1de1321-4c54-4acd-b649-4a969d0b02a5", + "execution_count": 35, + "id": "8c79d083-50cd-4baf-acd9-02279e255ac6", "metadata": {}, "outputs": [], "source": [ - "transit_list = [\n", - " \"buses\",\n", - " \"van\",\n", - " \"light rail\",\n", - " \"light rail vehicles\",\n", - 
" \"lrv\",\n", - " \"train\",\n", - " \"bus\",\n", - " \"rail\",\n", - " \"locomotives\",\n", - " \"ferry\",\n", - " \"vessels\",\n", - " \"trolley\",\n", - " \"vehicles\",\n", - " \"emus\",\n", - " \"trolleys\",\n", - " \"turnouts\",\n", - " \"routes\",\n", - " \"station\",\n", - " \"signals\",\n", - " \"facility\",\n", - " \"locations\",\n", - " \"congestion\",\n", - " \"rideshare\",\n", - " \"ridesharing\",\n", - " \"vanpool\",\n", - " \"high quality transit areas\",\n", - " \"hqta\",\n", - " \"car share\",\n", - " \"bus\",\n", - " \"metro\",\n", - " \"station\", # Station comes up a few times as a charging station and also as a train station\n", - " \"transit\",\n", - " \"fare\",\n", - " \"brt\",\n", - " \"yarts\",\n", - " \"railroad\",\n", - " \"rider\",\n", - " \"highway-rail\",\n", - " \"bike\",\n", - " \"bicycle\",\n", - " 'bus rapid transit',\n", - " 'transit-oriented development',\n", - " 'commuter rail',\n", - " 'bus stop',\n", - " 'shuttle',\n", - " 'mobility hub',\n", - " 'fare evasion',\n", - " 'park and ride',\n", - " 'bus lane',\n", - " 'bicycle lane',\n", - " 'multimodal',\n", - " 'farebox',\n", - " 'transfer',\n", - " 'intermodal',\n", - " 'paratransit',\n", - " 'bus route',\n", - " 'express bus',\n", - " 'bus terminal',\n", - " 'bus shelter',\n", - " 'bus depot',\n", - " 'bus service',\n", - " 'transit agency',\n", - " 'fare collection',\n", - " 'fare structure',\n", - " 'fare card',\n", - " 'transit signal priority',\n", - " 'bus rapid transit',\n", - " 'fare integration',\n", - " 'transportation equity',\n", - " 'mobility as a service',\n", - " 'fare subsidy',\n", - " 'fare payment',\n", - " 'integrated transit',\n", - " 'automated transit',\n", - " 'fare technology',\n", - " 'real-time transit',\n", - " 'mobility management',\n", - " 'bus network',\n", - " 'rail network',\n", - " 'public transportation',\n", - " 'commute',\n", - " \"cyclist\",\n", - " \"pedestrian\",\n", - " ## including the spelling errors of `pedestrian`\n", - " 
\"pedestrain\",\n", - " \"crosswalk\",\n", - " \"bulb out\",\n", - " \"bulb-out\",\n", - " \"active transp\",\n", - " \"traffic reduction\",\n", - " \"speed reduction\",\n", - " \"ped\",\n", - " \"srts\",\n", - " \"safe routes to school\",\n", - " \"sidewalk\",\n", - " \"side walk\",\n", - " \"trail\",\n", - " \"atp\",\n", - "]" + "\n", + "main_info = projects_main_info.to_frame()" ] }, { "cell_type": "code", - "execution_count": 135, - "id": "d112f04a-8aac-4871-9b84-9daba4b596b7", + "execution_count": 36, + "id": "92a926d4-90c8-41a3-9b62-19ab0d81ba2e", "metadata": {}, "outputs": [], "source": [ - "# Remove duplicates\n", - "cleaned_transit_list = list(set(transit_list))" + "main_info = main_info.rename(columns = {0:'project_info'})" ] }, { "cell_type": "code", - "execution_count": 136, - "id": "0fa39db5-a4f8-4702-8774-254aa167181e", + "execution_count": 37, + "id": "b607cb21-0735-46ee-9f32-840d31e09d9f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['intermodal',\n", - " 'transfer',\n", - " 'vanpool',\n", - " 'pedestrian',\n", - " 'bulb-out',\n", - " 'integrated transit',\n", - " 'mobility management',\n", - " 'trolleys',\n", - " 'bus depot',\n", - " 'congestion',\n", - " 'sidewalk',\n", - " 'shuttle',\n", - " 'bus shelter',\n", - " 'trolley',\n", - " 'fare evasion',\n", - " 'signals',\n", - " 'metro',\n", - " 'brt',\n", - " 'mobility as a service',\n", - " 'fare structure',\n", - " 'van',\n", - " 'light rail vehicles',\n", - " 'rail network',\n", - " 'bulb out',\n", - " 'bus terminal',\n", - " 'lrv',\n", - " 'bicycle lane',\n", - " 'pedestrain',\n", - " 'yarts',\n", - " 'rideshare',\n", - " 'car share',\n", - " 'trail',\n", - " 'park and ride',\n", - " 'fare integration',\n", - " 'crosswalk',\n", - " 'ridesharing',\n", - " 'paratransit',\n", - " 'commuter rail',\n", - " 'speed reduction',\n", - " 'multimodal',\n", - " 'turnouts',\n", - " 'srts',\n", - " 'rider',\n", - " 'side walk',\n", - " 'fare subsidy',\n", - " 'transit signal priority',\n", - " 
'train',\n", - " 'transportation equity',\n", - " 'rail',\n", - " 'commute',\n", - " 'light rail',\n", - " 'bus route',\n", - " 'safe routes to school',\n", - " 'fare collection',\n", - " 'ped',\n", - " 'buses',\n", - " 'locations',\n", - " 'cyclist',\n", - " 'farebox',\n", - " 'public transportation',\n", - " 'high quality transit areas',\n", - " 'transit-oriented development',\n", - " 'emus',\n", - " 'facility',\n", - " 'transit agency',\n", - " 'real-time transit',\n", - " 'railroad',\n", - " 'routes',\n", - " 'active transp',\n", - " 'atp',\n", - " 'vessels',\n", - " 'automated transit',\n", - " 'highway-rail',\n", - " 'bus rapid transit',\n", - " 'fare payment',\n", - " 'fare',\n", - " 'bus lane',\n", - " 'mobility hub',\n", - " 'transit',\n", - " 'traffic reduction',\n", - " 'ferry',\n", - " 'bus stop',\n", - " 'bus network',\n", - " 'fare technology',\n", - " 'express bus',\n", - " 'bicycle',\n", - " 'bike',\n", - " 'locomotives',\n", - " 'station',\n", - " 'bus',\n", - " 'bus service',\n", - " 'vehicles',\n", - " 'fare card',\n", - " 'hqta']" + "29420" ] }, - "execution_count": 136, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "cleaned_transit_list" + "len(main_info)" ] }, { "cell_type": "code", - "execution_count": 137, - "id": "c959117b-091d-4610-b36f-202f3dd97c9e", + "execution_count": 38, + "id": "958fabea-d9c7-4e70-9c18-53ee5792606d", "metadata": {}, "outputs": [], "source": [ - "def filter_projects(\n", - " df,\n", - " columns_to_search: list,\n", - " keywords_search: list,\n", - " file_name: str,\n", - " gcs_path: str,\n", - " projects_to_del: list,\n", - "):\n", + "# Assuming main_info.project_info.value_counts() gives you a Series\n", + "value_counts_series = main_info.project_info.value_counts()\n", + "\n", + "# Convert the Series to a DataFrame with columns 'value' and 'count'\n", + "df_value_counts = value_counts_series.reset_index()\n", + "df_value_counts.columns = ['project_info', 'total_values']" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "063ff71f-032a-4564-8398-cfbd982859ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "28228" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_value_counts)" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "35cf6fe5-dba8-46c4-8a47-15d4eabac99f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 28228.00\n", + "mean 1.04\n", + "std 0.62\n", + "min 1.00\n", + "25% 1.00\n", + "50% 1.00\n", + "75% 1.00\n", + "max 34.00\n", + "Name: total_values, dtype: float64" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_value_counts.total_values.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "afd35c1b-d15b-4b7a-98e4-2c8ba9ab175b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "403" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(df_value_counts.loc[df_value_counts.total_values > 1])" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "320fe207-3176-4f70-85a5-52b627d12dc1", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
project_infototal_values
0None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019, grant encumbered amount: 56000.0, local encumbered amount: 0.0, total encumbered amount: 56000.0, expendedamount: 0.0, activebalance: 44800.0, closedoutbalance: 0, project status: Open-0.034
1None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019, grant encumbered amount: 56000.0, local encumbered amount: 0.0, total encumbered amount: 56000.0, expendedamount: 0.0, activebalance: 11200.0, closedoutbalance: 0, project status: Open-0.034
2None-None-Sandag Lrtp-None- notes: Category: Nan, Status: Nan, Aqc 2016 1: Nan, Aqc 2020 1: Nan, Pricmcp: Nan, Conncmcp: Nan, Layer Name: Mobility Hubs And Flexible Fleets, Corridor I: Nan, Type 1: Nan, Existing: Nan, Limits: Nan, Description 1: Nan, Route: Nan, Routetype: Nan, Route Desc: Nan, Rp 2021 Id: Nan, Rp 2021 Id 1: Nan, Capital Cost 2020 Millions: Nan-0.031
3None-Purchase Replacement < 30 Ft Bus-Blackcat-None- grant fiscal year: 2021, grant encumbered amount: 84000.0, local encumbered amount: 0.0, total encumbered amount: 84000.0, expendedamount: 0.0, activebalance: 84000.0, closedoutbalance: 0, project status: Open-0.029
4Emergency Opening-None-LP2000-Tulare County- location name: nan, route name: 0-CR, state hwy ind: N, senate district code: nan, category desc: Emergency Opening, district code: 6.0, comment desc: Emergency Opening, postmile combined: nan-0.022
\n", + "
" + ], + "text/plain": [ + " project_info \\\n", + "0 None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019, grant encumbered amount: 56000.0, local encumbered amount: 0.0, total encumbered amount: 56000.0, expendedamount: 0.0, activebalance: 44800.0, closedoutbalance: 0, project status: Open-0.0 \n", + "1 None-Purchase Replacement Van-Blackcat-None- grant fiscal year: 2019, grant encumbered amount: 56000.0, local encumbered amount: 0.0, total encumbered amount: 56000.0, expendedamount: 0.0, activebalance: 11200.0, closedoutbalance: 0, project status: Open-0.0 \n", + "2 None-None-Sandag Lrtp-None- notes: Category: Nan, Status: Nan, Aqc 2016 1: Nan, Aqc 2020 1: Nan, Pricmcp: Nan, Conncmcp: Nan, Layer Name: Mobility Hubs And Flexible Fleets, Corridor I: Nan, Type 1: Nan, Existing: Nan, Limits: Nan, Description 1: Nan, Route: Nan, Routetype: Nan, Route Desc: Nan, Rp 2021 Id: Nan, Rp 2021 Id 1: Nan, Capital Cost 2020 Millions: Nan-0.0 \n", + "3 None-Purchase Replacement < 30 Ft Bus-Blackcat-None- grant fiscal year: 2021, grant encumbered amount: 84000.0, local encumbered amount: 0.0, total encumbered amount: 84000.0, expendedamount: 0.0, activebalance: 84000.0, closedoutbalance: 0, project status: Open-0.0 \n", + "4 Emergency Opening-None-LP2000-Tulare County- location name: nan, route name: 0-CR, state hwy ind: N, senate district code: nan, category desc: Emergency Opening, district code: 6.0, comment desc: Emergency Opening, postmile combined: nan-0.0 \n", + "\n", + " total_values \n", + "0 34 \n", + "1 34 \n", + "2 31 \n", + "3 29 \n", + "4 22 " + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df_value_counts.loc[df_value_counts.total_values > 1].sort_values(by = ['total_values'], ascending = False).head()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "efe2427d-adfb-47ca-994e-83f442dedfa3", + "metadata": {}, + "outputs": [], + "source": [ + "# 
complete.loc[(complete.source == \"LP2000\") & (complete.project_title == \"Emergency Opening\") & (complete.county == \"Tulare County\")]." + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "c959117b-091d-4610-b36f-202f3dd97c9e", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_projects(\n", + " df,\n", + " columns_to_search: list,\n", + " keywords_search: list,\n", + " file_name: str,\n", + " gcs_path: str,\n", + " projects_to_del: list,\n", + "):\n", "\n", " # Filter out for Cordon\n", " df = _specific_list_utils.find_keywords(df, columns_to_search, keywords_search)\n", @@ -1661,7 +1658,97 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 45, + "id": "e3325be6-ab38-4dd9-9a7f-17c05972bef7", + "metadata": {}, + "outputs": [], + "source": [ + "transit_terms = [\n", + " \"automated transit\",\n", + " \"brt\",\n", + " \"bus\",\n", + " \"bus depot\",\n", + " \"bus lane\",\n", + " \"bus lanes\",\n", + " \"bus network\",\n", + " \"bus rapid transit\",\n", + " \"bus route\",\n", + " \"bus routes\",\n", + " \"bus service\",\n", + " \"bus shelter\",\n", + " \"bus stop\",\n", + " \"bus terminal\",\n", + " \"buses\",\n", + " \"commuter rail\",\n", + " \"express bus\",\n", + " \"fare card\",\n", + " \"fare collection\",\n", + " \"fare evasion\",\n", + " \"fare integration\",\n", + " \"fare payment\",\n", + " \"fare structure\",\n", + " \"fare subsidy\",\n", + " \"fare technology\",\n", + " \"farebox\",\n", + " \"ferry\",\n", + " \"ferrys\",\n", + " \"high quality transit areas\",\n", + " \"integrated transit\",\n", + " \"intermodal\",\n", + " \"light rail\",\n", + " \"light rail vehicles\",\n", + " \"locomotives\",\n", + " \"mobility as a service\",\n", + " \"mobility hub\",\n", + " \"multimodal\",\n", + " \"paratransit\",\n", + " \"rail\",\n", + " \"rail network\",\n", + " \"railroad\",\n", + " \"shuttle\",\n", + " \"shuttles\",\n", + " \"station\",\n", + " \"terminal\",\n", + " \"train\",\n", + " \"trains\",\n", + " 
\"transit\",\n", + " \"transit agency\",\n", + " \"transit center\",\n", + " \"transit hub\",\n", + " \"transit signal priority\",\n", + " \"transit-oriented development\",\n", + " \"transportation equity\",\n", + " \"trolley\",\n", + " \"trolleys\",\n", + " \"van\",\n", + " \"vans\"\n", + "]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "e00e746a-acdb-4232-8916-24159e10c7fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "58" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(transit_terms)" + ] + }, + { + "cell_type": "code", + "execution_count": 47, "id": "44233fea-c8a1-4f48-b96c-1f74b58b083c", "metadata": {}, "outputs": [ @@ -1669,6 +1756,8 @@ "name": "stderr", "output_type": "stream", "text": [ + "/home/jovyan/data-analyses/project_list/_specific_list_utils.py:18: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", + " df[i]\n", "/home/jovyan/data-analyses/project_list/_specific_list_utils.py:18: FutureWarning: The default value of regex will change from True to False in a future version. 
In addition, single character regular expressions will *not* be treated as literal strings when regex=True.\n", " df[i]\n" ] @@ -1681,7 +1770,7 @@ " \"project_title\",\n", " \"project_description\",\n", " ],\n", - " cleaned_transit_list,\n", + " transit_terms,\n", " \"sb125_transit\",\n", " GCS_FILE_PATH,\n", " [],\n", @@ -1690,17 +1779,17 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 48, "id": "45ac2172-9b5f-4506-b082-3277e7ddb280", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(10672, 19)" + "(4186, 19)" ] }, - "execution_count": 139, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -1719,28 +1808,28 @@ }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 49, "id": "74c46a32-087b-41d1-801b-289a1ae54b90", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'36% of projects are related to Transit'" + "'14% or 4186 of projects in this list are related to Transit'" ] }, - "execution_count": 140, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "f\"{(int(len(transit_df) / len(complete) * 100))}% of projects are related to Transit\"" + "f\"{(int(len(transit_df) / len(complete) * 100))}% or {len(transit_df)} of projects in this list are related to Transit\"" ] }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 50, "id": "a83ec341-f24c-4e47-800a-1889c48c9d8a", "metadata": {}, "outputs": [], @@ -1750,7 +1839,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 51, "id": "a9c90ab2-be8a-44a3-8dad-372a0b9e762e", "metadata": {}, "outputs": [], @@ -1760,7 +1849,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 52, "id": "9aa09882-5c2c-40f0-a93a-e74abd5d2916", "metadata": {}, "outputs": [], @@ -1770,7 +1859,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 53, "id": "ec85ce97-a0d7-469f-9138-caba4711a37d", "metadata": {}, "outputs": [], 
@@ -1780,60 +1869,60 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 54, "id": "a33a45f1-3047-4178-9c24-6a2384fece0a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "'The total estimated cost is $333,288,985,140 compared to $669,784,021,877 in all the compiled projects. This makes up 49% of the requested funding'" + "'The total estimated cost is $299,572,489,073 compared to $670,035,689,953 in all the compiled projects. This makes up 44% of the requested funding (that we have on file).'" ] }, - "execution_count": 145, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "f\"The total estimated cost is ${formatted_total_cost} compared to ${formatted_projects_cost} in all the compiled projects. This makes up {int((transit_cost/total_projects_cost) * 100)}% of the requested funding\"" + "f\"The total estimated cost is ${formatted_total_cost} compared to ${formatted_projects_cost} in all the compiled projects. This makes up {int((transit_cost/total_projects_cost) * 100)}% of the requested funding (that we have on file).\"" ] }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 55, "id": "57514f7f-cece-467b-a4c2-d4a891c1878f", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "LP2000 3216\n", - "Blackcat 1596\n", - "Scag Lrtp 1144\n", - "Fresno Cog Lrtp 1047\n", - "Kern Cog Lrtp 955\n", - "Sacog Lrtp 557\n", - "Madera Ctc Lrtp 300\n", - "Stancog Lrtp 240\n", - "Ambag Lrtp 227\n", - "Sbcag Lrtp 223\n", - "Slocog Lrtp 196\n", - "State Rail Plan 169\n", - "Scrtpa Lrtp 136\n", - "Tcag Lrtp 135\n", - "Mtc Lrtp 112\n", - "Bcag Lrtp 108\n", - "Sandag Lrtp 90\n", - "Sjcog Lrtp 87\n", - "Tmpo Lrtp 55\n", - "SB1 Feature Server 51\n", - "Mcagov Lrtp 26\n", - "Kcag Lrtp 2\n", + "Blackcat 1459\n", + "Scag Lrtp 660\n", + "LP2000 639\n", + "Sacog Lrtp 201\n", + "Madera Ctc Lrtp 199\n", + "Fresno Cog Lrtp 181\n", + "State Rail Plan 161\n", + "Mtc Lrtp 107\n", + "Sbcag Lrtp 
95\n", + "Sjcog Lrtp 63\n", + "Sandag Lrtp 63\n", + "Scrtpa Lrtp 60\n", + "SB1 Feature Server 50\n", + "Stancog Lrtp 49\n", + "Kern Cog Lrtp 45\n", + "Slocog Lrtp 35\n", + "Tmpo Lrtp 31\n", + "Bcag Lrtp 26\n", + "Ambag Lrtp 26\n", + "Tcag Lrtp 25\n", + "Mcagov Lrtp 10\n", + "Kcag Lrtp 1\n", "Name: source, dtype: int64" ] }, - "execution_count": 146, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -1842,79 +1931,95 @@ "transit_df.source.value_counts()" ] }, + { + "cell_type": "markdown", + "id": "e349dfc3-bd19-45d9-af24-6998814a77a2", + "metadata": {}, + "source": [ + "#### Keywords that were picked up" + ] + }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 56, "id": "0723a569-2807-4cfe-a552-4259132ef40a", "metadata": {}, "outputs": [], "source": [ - "def count_categories(df:pd.DataFrame, column:str):\n", + "def count_categories(df: pd.DataFrame, column: str):\n", " # Convert the result to a DataFrame\n", - " filtered_df = df.loc[df[column] != 'keyword not found'][[column]].value_counts()\n", - " result_df = pd.DataFrame(filtered_df, columns=['Count'])\n", + " filtered_df = df.loc[df[column] != \"keyword not found\"][[column]].value_counts()\n", + " result_df = pd.DataFrame(filtered_df, columns=[\"Count\"])\n", "\n", " # Reset the index to make the keyword a regular column\n", " result_df = result_df.reset_index()\n", "\n", " # Rename the columns if needed\n", - " result_df.columns = ['Keyword', 'Count']\n", + " result_df.columns = [\"Keyword\", \"Count\"]\n", " return result_df" ] }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 57, "id": "9d34f03f-0685-496b-816f-4a435e75f56c", "metadata": {}, "outputs": [], "source": [ - "proj_desc = count_categories(transit_df, 'lower_case_project_description_keyword_search')" + "proj_desc = count_categories(\n", + " transit_df, \"lower_case_project_description_keyword_search\"\n", + ")" ] }, { "cell_type": "code", - "execution_count": 149, + 
"execution_count": 58, "id": "6f9c0470-a9b1-47fb-aa1b-10d864b3da34", "metadata": {}, "outputs": [], "source": [ - "title = count_categories(transit_df, 'lower_case_project_title_keyword_search')" + "title = count_categories(transit_df, \"lower_case_project_title_keyword_search\")" ] }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 59, "id": "c31bfc4d-7acc-402b-a22f-0f9c5bb74b90", "metadata": {}, "outputs": [], "source": [ - "categories = pd.merge(proj_desc, title, on = 'Keyword', how = 'outer', indicator = True)" + "categories = pd.merge(proj_desc, title, on=\"Keyword\", how=\"outer\", indicator=True)" ] }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 60, "id": "30c575f5-3cee-4f84-86ec-c491564c120b", "metadata": {}, "outputs": [], "source": [ - "categories['Total Projects'] = categories.Count_x.fillna(0) + categories.Count_y.fillna(0)" + "categories[\"Total Projects\"] = categories.Count_x.fillna(0) + categories.Count_y.fillna(\n", + " 0\n", + ")" ] }, { "cell_type": "code", - "execution_count": 152, + "execution_count": 61, "id": "5789e0cc-751c-4471-919e-6321cf3ff3fc", "metadata": {}, "outputs": [], "source": [ - "categories = categories.sort_values(by = ['Total Projects'], ascending = False).reset_index(drop = True).drop(columns = ['Count_x','Count_y','_merge'])" + "categories = (\n", + " categories.sort_values(by=[\"Total Projects\"], ascending=False)\n", + " .reset_index(drop=True)\n", + " .drop(columns=[\"Count_x\", \"Count_y\", \"_merge\"])\n", + ")" ] }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 62, "id": "f488ab7a-9a99-4890-ad9f-fc0e3209c5f7", "metadata": {}, "outputs": [ @@ -1946,362 +2051,162 @@ " \n", " \n", " 0\n", - " bike\n", - " 2771.00\n", + " bus\n", + " 1725.00\n", " \n", " \n", " 1\n", - " bus\n", - " 1578.00\n", + " transit\n", + " 959.00\n", " \n", " \n", " 2\n", - " pedestrian\n", - " 1297.00\n", + " van\n", + " 408.00\n", " \n", " \n", " 3\n", - " transit\n", - " 
838.00\n", + " rail\n", + " 375.00\n", " \n", " \n", " 4\n", - " sidewalk\n", - " 719.00\n", + " buses\n", + " 259.00\n", " \n", " \n", " 5\n", - " trail\n", - " 533.00\n", + " station\n", + " 251.00\n", " \n", " \n", " 6\n", - " bicycle\n", - " 466.00\n", + " railroad\n", + " 209.00\n", " \n", " \n", " 7\n", - " signals\n", - " 453.00\n", + " paratransit\n", + " 86.00\n", " \n", " \n", " 8\n", - " van\n", - " 407.00\n", + " multimodal\n", + " 78.00\n", " \n", " \n", " 9\n", - " rail\n", - " 326.00\n", + " light rail\n", + " 73.00\n", " \n", " \n", " 10\n", - " buses\n", - " 240.00\n", + " ferry\n", + " 64.00\n", " \n", " \n", " 11\n", - " station\n", - " 212.00\n", + " intermodal\n", + " 41.00\n", " \n", " \n", " 12\n", - " transfer\n", - " 195.00\n", + " brt\n", + " 37.00\n", " \n", " \n", " 13\n", - " locations\n", - " 194.00\n", + " train\n", + " 34.00\n", " \n", " \n", " 14\n", - " railroad\n", - " 175.00\n", + " terminal\n", + " 31.00\n", " \n", " \n", " 15\n", - " facility\n", - " 164.00\n", + " commuter rail\n", + " 30.00\n", " \n", " \n", " 16\n", - " mobility management\n", - " 134.00\n", - " \n", - " \n", - " 17\n", - " crosswalk\n", - " 103.00\n", - " \n", - " \n", - " 18\n", - " ped\n", - " 99.00\n", - " \n", - " \n", - " 19\n", - " vehicles\n", - " 89.00\n", - " \n", - " \n", - " 20\n", - " paratransit\n", - " 83.00\n", - " \n", - " \n", - " 21\n", - " multimodal\n", - " 73.00\n", - " \n", - " \n", - " 22\n", - " rideshare\n", - " 67.00\n", - " \n", - " \n", - " 23\n", - " srts\n", - " 63.00\n", - " \n", - " \n", - " 24\n", - " light rail\n", - " 62.00\n", - " \n", - " \n", - " 25\n", - " ferry\n", - " 61.00\n", - " \n", - " \n", - " 26\n", - " routes\n", - " 60.00\n", - " \n", - " \n", - " 27\n", - " bus stop\n", - " 55.00\n", - " \n", - " \n", - " 28\n", - " congestion\n", - " 53.00\n", - " \n", - " \n", - " 29\n", - " bicycle lane\n", - " 49.00\n", - " \n", - " \n", - " 30\n", - " safe routes to school\n", - " 43.00\n", - " \n", - " \n", - " 31\n", 
- " metro\n", - " 42.00\n", - " \n", - " \n", - " 32\n", - " intermodal\n", - " 41.00\n", - " \n", - " \n", - " 33\n", - " brt\n", - " 34.00\n", - " \n", - " \n", - " 34\n", - " atp\n", - " 33.00\n", - " \n", - " \n", - " 35\n", - " train\n", - " 31.00\n", - " \n", - " \n", - " 36\n", - " park and ride\n", - " 29.00\n", - " \n", - " \n", - " 37\n", " express bus\n", " 29.00\n", " \n", " \n", - " 38\n", - " commuter rail\n", - " 29.00\n", - " \n", - " \n", - " 39\n", - " vanpool\n", - " 22.00\n", + " 17\n", + " trains\n", + " 20.00\n", " \n", " \n", - " 40\n", + " 18\n", " trolley\n", " 18.00\n", " \n", " \n", - " 41\n", + " 19\n", " shuttle\n", - " 16.00\n", + " 17.00\n", " \n", " \n", - " 42\n", + " 20\n", " locomotives\n", - " 15.00\n", + " 16.00\n", " \n", " \n", - " 43\n", - " rail network\n", + " 21\n", + " vans\n", " 15.00\n", " \n", " \n", - " 44\n", - " transit agency\n", - " 14.00\n", - " \n", - " \n", - " 45\n", + " 22\n", " mobility hub\n", - " 13.00\n", - " \n", - " \n", - " 46\n", - " bus shelter\n", - " 13.00\n", - " \n", - " \n", - " 47\n", - " fare\n", - " 11.00\n", - " \n", - " \n", - " 48\n", - " bus rapid transit\n", - " 9.00\n", - " \n", - " \n", - " 49\n", - " transit signal priority\n", - " 8.00\n", + " 14.00\n", " \n", " \n", - " 50\n", + " 23\n", " fare collection\n", - " 7.00\n", - " \n", - " \n", - " 51\n", - " bus route\n", - " 6.00\n", - " \n", - " \n", - " 52\n", - " light rail vehicles\n", - " 5.00\n", - " \n", - " \n", - " 53\n", - " turnouts\n", - " 5.00\n", - " \n", - " \n", - " 54\n", - " commute\n", - " 5.00\n", - " \n", - " \n", - " 55\n", - " yarts\n", - " 4.00\n", - " \n", - " \n", - " 56\n", - " pedestrain\n", - " 4.00\n", - " \n", - " \n", - " 57\n", - " rider\n", - " 3.00\n", + " 8.00\n", " \n", " \n", - " 58\n", + " 24\n", " integrated transit\n", - " 3.00\n", - " \n", - " \n", - " 59\n", - " vessels\n", - " 3.00\n", + " 4.00\n", " \n", " \n", - " 60\n", - " public transportation\n", + " 25\n", + " trolleys\n", " 2.00\n", " 
\n", " \n", - " 61\n", + " 26\n", " fare payment\n", " 2.00\n", " \n", " \n", - " 62\n", - " trolleys\n", - " 2.00\n", - " \n", - " \n", - " 63\n", - " bus network\n", - " 1.00\n", - " \n", - " \n", - " 64\n", + " 27\n", " farebox\n", " 1.00\n", " \n", " \n", - " 65\n", - " emus\n", - " 1.00\n", - " \n", - " \n", - " 66\n", - " cyclist\n", - " 1.00\n", - " \n", - " \n", - " 67\n", - " traffic reduction\n", - " 1.00\n", - " \n", - " \n", - " 68\n", - " automated transit\n", + " 28\n", + " fare technology\n", " 1.00\n", " \n", " \n", - " 69\n", - " ridesharing\n", + " 29\n", + " shuttles\n", " 1.00\n", " \n", " \n", - " 70\n", + " 30\n", " mobility as a service\n", " 1.00\n", " \n", " \n", - " 71\n", - " car share\n", + " 31\n", + " automated transit\n", " 1.00\n", " \n", " \n", @@ -2309,82 +2214,42 @@ "" ], "text/plain": [ - " Keyword Total Projects\n", - "0 bike 2771.00\n", - "1 bus 1578.00\n", - "2 pedestrian 1297.00\n", - "3 transit 838.00\n", - "4 sidewalk 719.00\n", - "5 trail 533.00\n", - "6 bicycle 466.00\n", - "7 signals 453.00\n", - "8 van 407.00\n", - "9 rail 326.00\n", - "10 buses 240.00\n", - "11 station 212.00\n", - "12 transfer 195.00\n", - "13 locations 194.00\n", - "14 railroad 175.00\n", - "15 facility 164.00\n", - "16 mobility management 134.00\n", - "17 crosswalk 103.00\n", - "18 ped 99.00\n", - "19 vehicles 89.00\n", - "20 paratransit 83.00\n", - "21 multimodal 73.00\n", - "22 rideshare 67.00\n", - "23 srts 63.00\n", - "24 light rail 62.00\n", - "25 ferry 61.00\n", - "26 routes 60.00\n", - "27 bus stop 55.00\n", - "28 congestion 53.00\n", - "29 bicycle lane 49.00\n", - "30 safe routes to school 43.00\n", - "31 metro 42.00\n", - "32 intermodal 41.00\n", - "33 brt 34.00\n", - "34 atp 33.00\n", - "35 train 31.00\n", - "36 park and ride 29.00\n", - "37 express bus 29.00\n", - "38 commuter rail 29.00\n", - "39 vanpool 22.00\n", - "40 trolley 18.00\n", - "41 shuttle 16.00\n", - "42 locomotives 15.00\n", - "43 rail network 15.00\n", - "44 transit agency 
14.00\n", - "45 mobility hub 13.00\n", - "46 bus shelter 13.00\n", - "47 fare 11.00\n", - "48 bus rapid transit 9.00\n", - "49 transit signal priority 8.00\n", - "50 fare collection 7.00\n", - "51 bus route 6.00\n", - "52 light rail vehicles 5.00\n", - "53 turnouts 5.00\n", - "54 commute 5.00\n", - "55 yarts 4.00\n", - "56 pedestrain 4.00\n", - "57 rider 3.00\n", - "58 integrated transit 3.00\n", - "59 vessels 3.00\n", - "60 public transportation 2.00\n", - "61 fare payment 2.00\n", - "62 trolleys 2.00\n", - "63 bus network 1.00\n", - "64 farebox 1.00\n", - "65 emus 1.00\n", - "66 cyclist 1.00\n", - "67 traffic reduction 1.00\n", - "68 automated transit 1.00\n", - "69 ridesharing 1.00\n", - "70 mobility as a service 1.00\n", - "71 car share 1.00" + " Keyword Total Projects\n", + "0 bus 1725.00\n", + "1 transit 959.00\n", + "2 van 408.00\n", + "3 rail 375.00\n", + "4 buses 259.00\n", + "5 station 251.00\n", + "6 railroad 209.00\n", + "7 paratransit 86.00\n", + "8 multimodal 78.00\n", + "9 light rail 73.00\n", + "10 ferry 64.00\n", + "11 intermodal 41.00\n", + "12 brt 37.00\n", + "13 train 34.00\n", + "14 terminal 31.00\n", + "15 commuter rail 30.00\n", + "16 express bus 29.00\n", + "17 trains 20.00\n", + "18 trolley 18.00\n", + "19 shuttle 17.00\n", + "20 locomotives 16.00\n", + "21 vans 15.00\n", + "22 mobility hub 14.00\n", + "23 fare collection 8.00\n", + "24 integrated transit 4.00\n", + "25 trolleys 2.00\n", + "26 fare payment 2.00\n", + "27 farebox 1.00\n", + "28 fare technology 1.00\n", + "29 shuttles 1.00\n", + "30 mobility as a service 1.00\n", + "31 automated transit 1.00" ] }, - "execution_count": 153, + "execution_count": 62, "metadata": {}, "output_type": "execute_result" }