Skip to content

Commit

Permalink
File reading revamp
Browse files Browse the repository at this point in the history
File reading now uses the high-level pandas library instead of numpy. This allows intelligent processing of any number of masses for each element. A bug that some files were read without the last column was fixed
  • Loading branch information
yz3062 committed Jan 20, 2020
1 parent 0759996 commit 1bb3190
Showing 1 changed file with 36 additions and 41 deletions.
77 changes: 36 additions & 41 deletions PaxsPy.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
matplotlib.use("TkAgg")
import matplotlib.pyplot as plt

spike_answer = str(raw_input("Are you using 2006-2 UTh spike and 2019-1b Pa spike? If not, click no and search \'MixedPa' in script and change its values. [y] or n:") or 'y')
spike_answer = str(raw_input("Are you using 2006-2 UTh spike and 2019-2a Pa spike? If not, click no and search \'MixedPa' in script and change its values. [y] or n:") or 'y')
if spike_answer == 'n':
sys.exit()
figure_answer = str(raw_input("Do you want to inspect ICPMS raw output in figures?[y] or n:") or 'y')
Expand All @@ -34,47 +34,41 @@
file_names = askopenfilenames(title="Select all the ICPMS output files and a \'sample_info' excel file") # show an "Open" dialog box and return the path to the selected file

def return_five_point_avg(file_name):
# start reading from row 12, which are name/unit/blank
# txt_handle = np.genfromtxt(file_name, delimiter='\t', skip_header=12)
txt_handle = np.genfromtxt(file_name, delimiter='\t')
if np.all(np.isnan(txt_handle[0])):# if first line is empty
txt_handle = txt_handle[12:]
else:
txt_handle = txt_handle[7:]
def return_five_point_avg(file_name):
# read txt as csv, using tab as separator
txt_handle = pd.read_csv(file_name,sep='\t',header=None)
txt_handle.dropna(how='all',axis='index',inplace=True) # drop the rows where all elements are missing
txt_handle.dropna(how='all',axis='columns',inplace=True) # drop the columns where all elements are missing
txt_handle.reset_index(drop=True,inplace=True) # index at this point doesn't start with 0,1,2 etc because some rows were deleted. This will reset index, and drop=True will prevent a new column named "index" be created
txt_handle.drop([0,1,2],inplace=True) # remove the first three rows
txt_handle.reset_index(drop=True,inplace=True)
txt_handle = txt_handle.astype(float)
if figure_answer == 'y':
txt_handle_r = np.transpose(txt_handle)
data_df = pd.DataFrame(data=txt_handle_r[1:-1,:])
data_df.plot(sharex=True,title=file_name)
#plt.ion()
#plt.show()
txt_handle_r = txt_handle.transpose() # create a transposed version
txt_handle_r.columns = txt_handle_r.iloc[0] # name the columns with the first row (mass)
txt_handle_r.drop(txt_handle_r.index[0],inplace=True) # drop the first row (mass)
txt_handle_r.plot(sharex=True,title=file_name)
plt.savefig(file_name+'.png')
# get rid of first column (mass) and last column (nan)
txt_handle = txt_handle[:,1:-1]
# If not blank, check and remove outliers
# if 'Blank' not in file_name and 'blank' not in file_name:
txt_handle.set_index(txt_handle[0],inplace=True) # set index as mass
txt_handle.drop(columns=[0],inplace=True) # drop the mass column
txt_handle = reject_outliers(txt_handle)
# average accros five points
five_point_avg = ma.mean(txt_handle.reshape(len(txt_handle)/5, 5, -1),axis=1)
# A second check for outliers after the five point average, except when the file is Blank
# if 'Blank' not in file_name and 'blank' not in file_name:
# print file_name + " # outliers: " + str(ma.count_masked(five_point_avg))
# return reject_outliers(five_point_avg)
# else:
# return five_point_avg
five_point_avg_2nd_outlier_search = reject_outliers(five_point_avg)
print file_name + " # outliers: " + str(ma.count_masked(five_point_avg_2nd_outlier_search))
return five_point_avg_2nd_outlier_search

# average accros multiple masses of the same element
txt_handle.set_index(np.floor(txt_handle.index),inplace=True)
five_point_avg = txt_handle.groupby(txt_handle.index).mean()
five_point_avg_2nd_outlier_detection = reject_outliers(five_point_avg)
masekd_array = ma.masked_invalid(five_point_avg_2nd_outlier_detection.values)
print(file_name + ' # outliers: ' + str(np.count_nonzero(~np.isnan(masekd_array))))
return masekd_array

def reject_outliers(data, m = 2.):
# from https://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list
# median approach
d = ma.abs(data - ma.median(data, axis=1)[:,None])
mdev = ma.median(d,axis=1)
s = d/mdev[:,None]# if mdev!=0 else 0.
# # avg approach
# d = np.abs(data - np.mean(data,axis=1)[:,None])
# s = d/np.mean(data,axis=1)[:,None]
return ma.array(data,mask=np.greater(s,m))
'''
from https://stackoverflow.com/questions/11686720/is-there-a-numpy-builtin-to-reject-outliers-from-a-list
data is expected to be a pandas dataframe, where each row is a number of measurements on the same mass
'''
d = data.subtract(data.median(axis=1),axis='index').abs()
mdev = d.median(axis=1)
s = d.divide(mdev,axis='index')
return data.mask(s>m)

#%% process blanks and stds. Calculate tailcrxn slope and intercept
names = [name for name in file_names if 'Blank' in name or 'blank' in name or
Expand Down Expand Up @@ -247,8 +241,8 @@ def reject_outliers(data, m = 2.):
avg_233Pa_231Pa = ma.mean(MixPa_233231_avg)
avg_233Pa_231Pa_RSD = ma.sqrt(ma.sum((ma.array(MixPa_233231_avg) * ma.array(MixPa_233231_RSD))**2))/3/avg_233Pa_231Pa
Pa231_cncn_pg_g = 38
Pa231_soln_mass_in_spike_g = 0.1054
Pa233_soln_mass_in_spike_g = 12.7247
Pa231_soln_mass_in_spike_g = 0.2103
Pa233_soln_mass_in_spike_g = 26.6125
Pa233_mass_in_spike_pg_g = avg_233Pa_231Pa*Pa231_cncn_pg_g*Pa231_soln_mass_in_spike_g/Pa233_soln_mass_in_spike_g
decay_days = int(input('Enter the number of days from Pa233 spike preparation to Pa clean up column: '))

Expand Down Expand Up @@ -302,7 +296,8 @@ def reject_outliers(data, m = 2.):
else:
avg_233Pa_231Pa_df = pd.DataFrame({'avg_233Pa_231Pa for Marty':[decay_days,avg_233Pa_231Pa]},index=[0,1])
export_df = pd.concat([sample_name_df,export_data_df,avg_233Pa_231Pa_df],axis=1)
#%% save to csv

#%% save to excel
output_file_name = asksaveasfilename(title='Save the output file as')
if 'xlsx' not in output_file_name:
output_file_name = output_file_name + '.xlsx'
Expand Down

0 comments on commit 1bb3190

Please sign in to comment.