#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 6 15:34:58 2018
@author: deborahkhider
Misc: Standard ms misc calculations
"""
#%% Number of essential/recommended/desired questions
import pandas as pd
import numpy as np

# Read the Excel file
xl = pd.ExcelFile('./Paleoclimate Data Standards Concatenated Reponses.xlsx')
# Get the names of the excel sheets
sheetNames = xl.sheet_names
# Open them one by one and count the questions (one row per question)
count = 0
for sheetName in sheetNames:
    data = xl.parse(sheetName)
    count = count + data.shape[0]
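# Minimal report (a sketch, assuming the workbook parsed cleanly): print the
# tally so this cell produces visible output when run on its own.
print(str(count) + " questions in total")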
#%% Percentage of questions answered as essential/recommended/desired
import pandas as pd
import numpy as np

# Read the Excel file
xl = pd.ExcelFile('./Paleoclimate Data Standards Concatenated Reponses.xlsx')
# Get the names of the excel sheets
sheetNames = xl.sheet_names
# Broad categories: Cross-archive, Archive-Specific, Uncertainties, Chronologies
# The per-platform arrays built in the loop below hold the vote counts for
# Essential, Recommended, and Desired, in that order.
new_archiveType = []     # working group (archive type) per new-dataset question
legacy_archiveType = []  # working group (archive type) per legacy-dataset question
new_rec = []             # final recommendation level per new-dataset question
legacy_rec = []          # final recommendation level per legacy-dataset question
choice = ["essential", "recommended", "desired"]
for sheetName in sheetNames:
    data = xl.parse(sheetName)
    # Get the type of archive from the sheet name
    if "_" in sheetName:
        n = sheetName.split("_")
        archiveType = n[0]
    else:
        archiveType = sheetName
    # Flip through each property and determine whether it's essential,
    # recommended, or desired
    for index in np.arange(0, data.shape[0]):
        # Gather the responses per platform and replace NaN by zero
        twitter = np.nan_to_num(np.array([data['Essential Twitter'][index],
                                          data['Recommended Twitter'][index],
                                          data['Desired Twitter'][index]]))  # Twitter responses
        wiki = np.nan_to_num(np.array([data['Essential Wiki'][index],
                                       data['Recommended Wiki'][index],
                                       data['Desired Wiki'][index]]))  # Wiki responses
        survey = np.nan_to_num(np.array([data['Essential Survey'][index],
                                         data['Recommended Survey'][index],
                                         data['Desired Survey'][index]]))  # Survey responses
        # Add up the votes across platforms
        total = twitter + wiki + survey
        # Pick the final answer: the level with the most votes
        # (np.argmax returns the first maximum, so ties favor 'essential')
        max_val = np.max(total)
        if max_val == 0:
            ans = 'desired'  # no votes at all defaults to 'desired'
        else:
            idx = np.argmax(total)
            ans = choice[idx]
        # Update the recommendation + archiveType counters
        if 'new' in data['Dataset status'][index]:
            new_archiveType.append(archiveType)
            new_rec.append(ans)
        else:
            legacy_archiveType.append(archiveType)
            legacy_rec.append(ans)
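# Quick sanity check (a sketch): every parsed row lands in exactly one of the
# two lists, so together they should cover all questions tallied above.
print(str(len(new_rec) + len(legacy_rec)) + " questions classified")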
# Concatenate the answers into pandas DataFrames
new_dataframe = pd.DataFrame({'WG': new_archiveType, 'rec': new_rec})
legacy_dataframe = pd.DataFrame({'WG': legacy_archiveType, 'rec': legacy_rec})
# Count the essential/recommended/desired answers across all platforms
new_counts = new_dataframe.groupby('rec').size()
leg_counts = legacy_dataframe.groupby('rec').size()
# Print out the recommendations:
print('New datasets Recommendation: ')
for item in choice:
    print(item + ": " + str(new_counts[item]))
    print(item + ": " + str(new_counts[item] / new_dataframe.shape[0] * 100) + "%")
print('Legacy datasets Recommendation: ')
for item in choice:
    print(item + ": " + str(leg_counts[item]))
    print(item + ": " + str(leg_counts[item] / legacy_dataframe.shape[0] * 100) + "%")
# Number of questions from each working group
print(new_dataframe.groupby('WG').size())
print(legacy_dataframe.groupby('WG').size())
# Number of answers from each WG
WGs = new_dataframe['WG'].unique()
print('New Datasets: ')
for WG in WGs:
    # Filter to this working group
    temp = new_dataframe.loc[new_dataframe['WG'] == WG]
    print(WG)
    print(temp.groupby('rec').size())
print('Legacy Datasets: ')
for WG in legacy_dataframe['WG'].unique():
    # Filter to this working group (legacy sheets may hold different WGs)
    temp = legacy_dataframe.loc[legacy_dataframe['WG'] == WG]
    print(WG)
    print(temp.groupby('rec').size())
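# A compact alternative (a sketch over the same DataFrames): pd.crosstab builds
# the full WG-by-recommendation table in a single call per dataset status.
print(pd.crosstab(new_dataframe['WG'], new_dataframe['rec']))
print(pd.crosstab(legacy_dataframe['WG'], legacy_dataframe['rec']))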