-
Notifications
You must be signed in to change notification settings - Fork 0
/
FinalprojectinvolvingUSwaterqualityA_to_pandas.py
111 lines (89 loc) · 3.4 KB
/
FinalprojectinvolvingUSwaterqualityA_to_pandas.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#%%
## Goals: To compare different areas and contaminant levels, illustrate areas, contaminant levels over time.
import pandas as pd
import folium
import re
from pathlib import Path
#make sure to install ipyleaflet
#https://catalog.data.gov/dataset/water-quality-data-0de37
pd.set_option('display.max_columns', None)

# Base directory for the water-quality CSVs. Defined once so the location is
# changed in a single place instead of four separately hard-coded paths.
DATA_DIR = Path(r"C:\Users\amcfa\gitfiles\Projects\MastersWork\FundamentalssofDataVisualizzations\Water quality data")

field_results = pd.read_csv(DATA_DIR / "field_results.csv", low_memory=False)
period_of_record = pd.read_csv(DATA_DIR / "period_of_record.csv", low_memory=False)
stations = pd.read_csv(DATA_DIR / "stations.csv", low_memory=False)
lab_results = pd.read_csv(DATA_DIR / "lab_results.csv", low_memory=False)

# Narrow each frame to the columns used for the comparison / mapping work below.
a = lab_results[['latitude', 'longitude', 'station_number', 'sample_date', 'parameter', 'result', 'reporting_limit', 'units']]
b = stations[['latitude', 'longitude', 'station_number']]
c = field_results[['latitude', 'longitude', 'station_number', 'full_station_name', 'sample_date', 'parameter', 'fdr_result', 'fdr_reporting_limit', 'uns_name']]
d = period_of_record[['latitude', 'longitude', 'station_number', 'sample_date_max', 'sample_date_min']]
# %%
# Replace cells whose entire value is a lone double-quote character with an
# empty string (DataFrame.replace matches whole cell values here, not substrings).
a = a.replace('"', '')
b = b.replace('"', '')
c = c.replace('"', '')
# BUG FIX: the original line read `c = d.replace('"', '')`, which overwrote the
# cleaned field-results subset with period-of-record data and left `d` uncleaned.
d = d.replace('"', '')
#%%
#%%
# Parse lab-result sample timestamps, and keep a date-only column for
# day-level grouping. `p`/`q` stay bound for the display cells that follow.
a['sample_date'] = pd.to_datetime(a['sample_date'])
a['new_date'] = a['sample_date'].dt.date
p = a['sample_date']
q = a['new_date']
# %%
# Display the parsed date-only column (notebook cell output).
q
#%%
# Quick look at the raw frames. NOTE(review): in a notebook cell only the LAST
# bare expression renders, so only period_of_record is actually shown here.
lab_results
stations
period_of_record
# %%
period_of_record
# %%
lab_results
# %%
stations
# %%
# Random sample of 10 stations for a spot check of the data.
stations.sample(n=10)
# %%
# Summary statistics for the numeric station columns.
stations.describe()
# %%
## Results grouped by area sounds like a good place to start, maybe result limits, lat/long
# BUG FIX: the original used polars syntax (`df.select([pl.col(...), ...])`) on
# pandas DataFrames, and `pl` was never imported, so these cells raised a
# NameError. Plain pandas column selection is the equivalent operation and
# matches the working subsets created right after loading.
a = lab_results[['latitude', 'longitude', 'station_number', 'sample_date', 'parameter', 'result', 'reporting_limit', 'units']]
# %%
b = stations[['latitude', 'longitude', 'station_number']]
# %%
c = field_results[['latitude', 'longitude', 'station_number', 'full_station_name', 'sample_date', 'parameter', 'fdr_result', 'fdr_reporting_limit', 'uns_name']]
# %%
# %%
d = period_of_record[['latitude', 'longitude', 'station_number', 'sample_date_max', 'sample_date_min']]
# %%
##
# %%
# Display each subset (one per notebook cell).
a
# %%
b
# %%
c
# %%
d
# %%
# Base folium map at hard-coded coordinates — presumably inside the sampling
# region covered by the station data; TODO confirm against station lat/longs.
m = folium.Map(location = [39.272938,-121.16])
m
# %%
# Here is where I would use some code to iterate over my dfs and merge the data onto the map
#%%
# So this just kinda made a mess, it just looks
# BUG FIX: polars `select` / `pl.col(...).sort_by(...)` syntax on a pandas
# frame (and `pl` was never imported) — NameError. The pandas equivalent keeps
# the original columns and adds the `parameter` column re-ordered by
# latitude / longitude as new columns.
a[["latitude", "longitude", "parameter"]].assign(
    parameters_sorted=a.sort_values("latitude")["parameter"].to_numpy(),
    parameters_sorted_long=a.sort_values("longitude")["parameter"].to_numpy(),
)
# %%
# looks like I need to start earlier in the data. By merging the data frames first.
# %%
a[["station_number", "sample_date", "parameter"]].assign(
    parameters_sorted=a.sort_values("station_number")["parameter"].to_numpy(),
    parameters_sorted_long=a.sort_values("station_number")["parameter"].to_numpy(),
)
# %%
# %%
# BUG FIX: the original `re.split(,)` was a SyntaxError — re.split requires a
# pattern and a string. Split on a comma plus optional trailing whitespace as a
# working starting point for parsing comma-separated location/name fields.
pattern = r",\s*"
tokens = re.split(pattern, "latitude, longitude, station_number")
tokens