Skip to content

Commit

Permalink
python stuff
Browse files Browse the repository at this point in the history
  • Loading branch information
nofurtherinformation committed Aug 26, 2024
1 parent 7acac92 commit 34c0b59
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 15 deletions.
4 changes: 3 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -36,4 +36,6 @@ yarn-error.log*
.env
package.json
public/data/___*
public/data/archive
public/data/archive
scripts/__pycache__/utils.cpython-39.pyc
.env copy
7 changes: 6 additions & 1 deletion scripts/correlation_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,12 @@
# %%
current_dir = path.dirname(path.abspath(__file__))
data_dir = path.join(current_dir, '..', 'public', 'data')
df_full = get_full_data()
# df_full = get_full_data()
df= pd.read_parquet(path.join(data_dir, 'full_.parquet'))
# %%
df = df.replace(-666666666.0, np.nan)
# %%
df.to_parquet(path.join(data_dir, 'full_tract.parquet'))
# %%
corr_df = df_full.rename(columns=corr_columns_dict)[corr_columns]
corr_df.replace(-666666666.0, np.nan, inplace=True)
Expand Down
16 changes: 16 additions & 0 deletions scripts/export_formats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# %%
import pandas as pd

# Notebook display settings: show every column and print floats with
# three decimal places.
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
# %%
# Load the tract-level table. The path is relative to the current working
# directory, so run this from the scripts/ folder.
tract_parquet = "../public/data/full_tract.parquet"
df_full = pd.read_parquet(tract_parquet)
# %%
# Write the same table to CSV and to an Excel workbook, omitting the
# DataFrame index from both outputs.
csv_target = "../public/data/full_tract.csv"
xlsx_target = "../public/data/full_tract.xlsx"
df_full.to_csv(csv_target, index=False)
df_full.to_excel(xlsx_target, index=False)


# %%
9 changes: 5 additions & 4 deletions scripts/precompute_summary_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
current_dir = path.dirname(path.abspath(__file__))
data_dir = path.join(current_dir, '..', 'public', 'data')
# %%
df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet'))
df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet'))

# %%
year = '2023'
Expand Down Expand Up @@ -143,6 +143,7 @@ def columnarize_msgpack(data, id_col, filepath, cols, compress=False):
data_min[id] = []
for col in cols:
data_min[id].append(values[col])
print(data_min)
try:
write_msgpack(data_min, filepath, compress)
except Exception as e:
Expand Down Expand Up @@ -405,10 +406,10 @@ def get_full_data(path, column_dict, out_cols=["GEOID"]):
)
# %%
us_demog = pd.read_parquet(path.join(data_dir, 'demography_us.parquet'))
us_demog['GEOID'] = 'us'
us_demog['GEOID'] = 1
us_demog['UNIT'] = "nation"
us_demog["UNIT_PLURAL"] = "nation"
# %%
data_dir = path.join(current_dir, '..', 'public', 'data')
columnarize_msgpack(us_demog.to_dict(orient="records"), 'GEOID', path.join(data_dir, 'national', f'1.msgpack'), list(us_demog.columns), compress=True)
out_dir = path.join(current_dir, '..', 'public', 'data', 'summary', 'national')
columnarize_msgpack(us_demog.to_dict(orient="records"), 'GEOID', path.join(data_dir, f'1.msgpack'), list(us_demog.columns), compress=True)
# %%
8 changes: 8 additions & 0 deletions scripts/tercile_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,14 @@
pd.set_option('display.float_format', lambda x: '%.3f' % x)
pd.set_option('display.max_columns', None)
# %%
df_full = pd.read_parquet("../public/data/full_tract.parquet")
matches = pd.read_csv('../../rafi-data-viz-data/data/divest_geoids.csv')
# %%
matches['GEOID_left'] = matches['GEOID_left'].astype(str).str.zfill(11)
# %%
divest_tracts = df_full[df_full.GEOID.isin(matches.GEOID_left)]
# %%

# %%
# acp stats
acp_stats_df = df_full[['community', 'TOTAL_POPULATION','NH WHITE ALONE','NH BLACK ALONE','NH AMERICAN INDIAN ALONE','HISPANIC OR LATINO']]
Expand Down
18 changes: 9 additions & 9 deletions scripts/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -133,15 +133,15 @@ def get_full_data():
properties = [{'geoid': row['id'], **row['properties']} for row in data]
acp_data = pd.DataFrame(properties)
df_full = pd.read_parquet(path.join(data_dir, 'full_tract.parquet'))
tract_info = pd.read_parquet(path.join(data_dir, 'tracts_info.parquet'))
df_full = df_full.merge(tract_info, on='GEOID')
df_full['COUMTY'] = df_full['GEOID'].str.slice(0, 5)
df_full = df_full.merge(acp_data, left_on='COUMTY', right_on='geoid')
df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])
df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND']
# sq m to sq mi
df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])/ 2589988.11
df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND']
# tract_info = pd.read_parquet(path.join(data_dir, 'tracts_info.parquet'))
# df_full = df_full.merge(tract_info, on='GEOID')
# df_full['COUMTY'] = df_full['GEOID'].str.slice(0, 5)
# df_full = df_full.merge(acp_data, left_on='COUMTY', right_on='geoid')
# df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])
# df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND']
# # sq m to sq mi
# df_full['ALAND'] = pd.to_numeric(df_full['ALAND'])/ 2589988.11
# df_full['DENSITY'] = df_full['TOTAL_POPULATION'] / df_full['ALAND']
# replace -666666666.0 with None
df_full = df_full.replace(-666666666.0, None)
rurality = pd.read_excel(path.join(data_dir, 'Urban Rural Classification 2013.xlsx'))[[
Expand Down

0 comments on commit 34c0b59

Please sign in to comment.