From 34c0b59ff362959bcb9a13d3e047539f651c135d Mon Sep 17 00:00:00 2001 From: nofurtherinformation Date: Mon, 26 Aug 2024 09:30:06 -0500 Subject: [PATCH] python stuff --- .gitignore | 4 +++- scripts/correlation_analysis.py | 7 ++++++- scripts/export_formats.py | 16 ++++++++++++++++ scripts/precompute_summary_stats.py | 9 +++++---- scripts/tercile_analysis.py | 8 ++++++++ scripts/utils.py | 18 +++++++++--------- 6 files changed, 47 insertions(+), 15 deletions(-) create mode 100644 scripts/export_formats.py diff --git a/.gitignore b/.gitignore index 4e0a2bc..42cc5d4 100644 --- a/.gitignore +++ b/.gitignore @@ -36,4 +36,6 @@ yarn-error.log* .env package.json public/data/___* -public/data/archive \ No newline at end of file +public/data/archive +scripts/__pycache__/utils.cpython-39.pyc +.env copy diff --git a/scripts/correlation_analysis.py b/scripts/correlation_analysis.py index 89b62a9..f3d4dc7 100644 --- a/scripts/correlation_analysis.py +++ b/scripts/correlation_analysis.py @@ -12,7 +12,12 @@ # %% current_dir = path.dirname(path.abspath(__file__)) data_dir = path.join(current_dir, '..', 'public', 'data') -df_full = get_full_data() +# df_full = get_full_data() +df= pd.read_parquet(path.join(data_dir, 'full_.parquet')) +# %% +df = df.replace(-666666666.0, np.nan) +# %% +df.to_parquet(path.join(data_dir, 'full_tract.parquet')) # %% corr_df = df_full.rename(columns=corr_columns_dict)[corr_columns] corr_df.replace(-666666666.0, np.nan, inplace=True) diff --git a/scripts/export_formats.py b/scripts/export_formats.py new file mode 100644 index 0000000..cb4a03a --- /dev/null +++ b/scripts/export_formats.py @@ -0,0 +1,16 @@ +# %% +import pandas as pd + +# pd remove col limit +pd.set_option('display.float_format', lambda x: '%.3f' % x) +pd.set_option('display.max_columns', None) +# %% +df_full = pd.read_parquet("../public/data/full_tract.parquet") +# %% +# export to common formats +df_full.to_csv("../public/data/full_tract.csv", index=False) +# excel +df_full.to_excel("../public/data/full_tract.xlsx", index=False) + + +# %% diff --git a/scripts/precompute_summary_stats.py b/scripts/precompute_summary_stats.py index 04637f3..1525dd2 100644 --- a/scripts/precompute_summary_stats.py +++ b/scripts/precompute_summary_stats.py @@ -11,7 +11,7 @@ current_dir = path.dirname(path.abspath(__file__)) data_dir = path.join(current_dir, '..', 'public', 'data') # %% -df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet')) +df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet')) # %% year = '2023' @@ -143,6 +143,7 @@ def columnarize_msgpack(data, id_col, filepath, cols, compress=False): data_min[id] = [] for col in cols: data_min[id].append(values[col]) + print(data_min) try: write_msgpack(data_min, filepath, compress) except Exception as e: @@ -405,10 +406,10 @@ def get_full_data(path, column_dict, out_cols=["GEOID"]): ) # %% us_demog = pd.read_parquet(path.join(data_dir, 'demography_us.parquet')) -us_demog['GEOID'] = 'us' +us_demog['GEOID'] = 1 us_demog['UNIT'] = "nation" us_demog["UNIT_PLURAL"] = "nation" # %% -data_dir = path.join(current_dir, '..', 'public', 'data') -columnarize_msgpack(us_demog.to_dict(orient="records"), 'GEOID', path.join(data_dir, 'national', f'1.msgpack'), list(us_demog.columns), compress=True) +out_dir = path.join(current_dir, '..', 'public', 'data', 'summary', 'national') +columnarize_msgpack(us_demog.to_dict(orient="records"), 'GEOID', path.join(data_dir, f'1.msgpack'), list(us_demog.columns), compress=True) # %% diff --git a/scripts/tercile_analysis.py b/scripts/tercile_analysis.py index ec2f50a..738a798 100644 --- a/scripts/tercile_analysis.py +++ b/scripts/tercile_analysis.py @@ -12,6 +12,14 @@ pd.set_option('display.float_format', lambda x: '%.3f' % x) pd.set_option('display.max_columns', None) # %% +df_full = pd.read_parquet("../public/data/full_tract.parquet") +matches = pd.read_csv('../../rafi-data-viz-data/data/divest_geoids.csv') +# %% +matches['GEOID_left'] = matches['GEOID_left'].astype(str).str.zfill(11) +# %% +divest_tracts = df_full[df_full.GEOID.isin(matches.GEOID_left)] +# %% + # %% # acp stats acp_stats_df = df_full[['community', 'TOTAL_POPULATION','NH WHITE ALONE','NH BLACK ALONE','NH AMERICAN INDIAN ALONE','HISPANIC OR LATINO']] diff --git a/scripts/utils.py b/scripts/utils.py index 59fd613..5ed7a3e 100644 --- a/scripts/utils.py +++ b/scripts/utils.py @@ -133,15 +133,15 @@ def get_full_data(): properties = [{'geoid': row['id'], **row['properties']} for row in data] acp_data = pd.DataFrame(properties) df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet')) - tract_info = pd.read_parquet(path.join(data_dir, 'tracts_info.parquet')) - df_full = df_full.merge(tract_info, on='GEOID') - df_full['COUMTY'] = df_full['GEOID'].str.slice(0, 5) - df_full = df_full.merge(acp_data, left_on='COUMTY', right_on='geoid') - df_full['ALAND'] = pd.to_numeric(df_full['ALAND']) - df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND'] - # sf to sq mi - df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])/ 2589988.11 - df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND'] + # tract_info = pd.read_parquet(path.join(data_dir, 'tracts_info.parquet')) + # df_full = df_full.merge(tract_info, on='GEOID') + # df_full['COUMTY'] = df_full['GEOID'].str.slice(0, 5) + # df_full = df_full.merge(acp_data, left_on='COUMTY', right_on='geoid') + # df_full['ALAND'] = pd.to_numeric(df_full['ALAND']) + # df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND'] + # # sf to sq mi + # df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])/ 2589988.11 + # df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND'] # replace -666666666.0 with None df_full = df_full.replace(-666666666.0, None) rurality = pd.read_excel(path.join(data_dir, 'Urban Rural Classification 2013.xlsx'))[[