Skip to content

Commit

Permalink
new main stats
Browse files Browse the repository at this point in the history
  • Loading branch information
mwegrzyn committed Nov 5, 2018
1 parent b2cc7d7 commit 31b2713
Show file tree
Hide file tree
Showing 17 changed files with 4,920 additions and 480 deletions.
4 changes: 4 additions & 0 deletions CFMT/output/cambridge_main_results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,,% correct,percentile,t,df,p
Face recognition,CFMT 1,100,72,0.59,26,0.561
Face recognition,CFMT 2,37,0,-3.40,26,0.002
Face recognition,CFMT 3,33,2,-2.25,26,0.033
9 changes: 9 additions & 0 deletions modules/case_stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Compute statistics for comparing individual against group"""

__version__ = '0.0.1'
__author__ = 'Martin Wegrzyn'

# import modules objects
from .percentile_scores import *
from .ttest_single import *
from .norm_stats import *
50 changes: 50 additions & 0 deletions modules/case_stats/norm_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Function to make output dataframe"""

# import required modules
import numpy as np
import pandas as pd
from scipy import stats

from .percentile_scores import *
from .ttest_single import *


def make_percentiles(group_mean, group_std, pat_mean):
    """Return (z, percentile) of a patient score under a normal group model.

    The previous implementation approximated this by drawing 1,000,000
    random samples from N(group_mean, group_std) and feeding them through
    ``get_percentiles``; that was nondeterministic and slow.  The closed
    form below is the exact value that simulation converges to.

    Parameters
    ----------
    group_mean : float
        Mean of the control group.
    group_std : float
        Standard deviation of the control group (must be > 0).
    pat_mean : float
        The patient's raw score.

    Returns
    -------
    tuple of (z, percentile)
        The patient's z-score and the corresponding percentile (0 to 100),
        matching the (z, percentile) pair ``get_percentiles`` returns.
    """
    # z-score of the patient under the group's normal distribution
    z = (float(pat_mean) - group_mean) / group_std
    # cumulative probability of that z, scaled from 0-1 to 0-100
    percentile = stats.norm.cdf(z) * 100

    return z, percentile


def make_tstats_df(df, pat_idx, pat_label='JB'):
    """Run a single-case t-test for every column of a dataframe.

    For each column, the control group's mean, std and count are computed
    after dropping the patient's row, and ``ttest_single`` compares the
    patient's raw score against the group.

    Parameters
    ----------
    df : pandas dataframe
        Rows are participants, columns are tasks/features; values must be
        numeric.
    pat_idx : index label
        Row index of the patient in ``df``.
    pat_label : str, optional
        Label used for the patient's row in the intermediate table
        (default 'JB', which was previously hard-coded).

    Returns
    -------
    ttest_df : pandas dataframe
        One row per column of ``df`` with formatted 't', 'df' and 'p'.
    """
    # summary statistics of the control group (patient's row removed)
    group_df = df.drop(pat_idx).apply(['mean', 'std', 'count'])
    # raw data of the patient, relabelled so it can be looked up by name
    pat_df = df.loc[[pat_idx]]
    pat_df.index = [pat_label]
    # stack group statistics and patient values into one lookup table
    stats_df = pd.concat([group_df, pat_df])

    # collect one t-test result per task/feature
    d = {}
    for c in stats_df.columns:
        t, dof, p = ttest_single(stats_df.loc[pat_label, c],
                                 stats_df.loc['mean', c],
                                 stats_df.loc['std', c],
                                 stats_df.loc['count', c])
        # format to match the project's CSV output (2 dp t, int df, 3 dp p)
        d[c] = {'t': '%.2f' % t, 'df': int(dof), 'p': '%.3f' % p}

    # one row per input column, with a fixed column order
    ttest_df = pd.DataFrame(d).T.loc[:, ['t', 'df', 'p']]

    return ttest_df
86 changes: 86 additions & 0 deletions modules/case_stats/percentile_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Functions for computing percentile scores"""

# import required modules
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing


# get z scores and transform to percentiles
def get_percentiles(con_data, pat_data):
    """Transform the patient's raw score into a z-score and a percentile.

    The patient's value is standardized against the controls' mean and
    population standard deviation (ddof=0) -- numerically identical to the
    ``sklearn.preprocessing.StandardScaler`` used previously -- and the
    z-score is mapped through the normal CDF to a 0-100 percentile.

    Parameters
    ----------
    con_data : (n, 1) shaped numpy array
        All the data of the controls.
    pat_data : (1, 1) shaped numpy array
        Data of the patient.

    Returns
    -------
    z : float
        The patient's z-score relative to the controls.
    percentile : float
        The patient's percentile score (0 to 100).
    """
    con_data = np.asarray(con_data, dtype=float)
    pat_data = np.asarray(pat_data, dtype=float)
    # standardize with the controls' mean and population std (ddof=0),
    # matching StandardScaler's default behaviour exactly
    mean = con_data.mean(axis=0)
    std = con_data.std(axis=0)
    z = ((pat_data - mean) / std)[-1]
    # transform the z-score to a cumulative probability
    cdf = stats.norm.cdf(z)[-1]
    # scale from 0 to 100 instead of 0 to 1
    percentile = cdf * 100
    # get z out of its 1-element array
    z = z[-1]
    return z, percentile


# apply to each column of a df
def make_percentile_df(df, pat_idx):
    """Compute z-scores and percentiles for all columns of a df.

    Applies ``get_percentiles`` to every column, comparing the patient's
    value against the remaining (control) rows.

    Parameters
    ----------
    df : pandas dataframe
        Table with values; every column must be numeric.
    pat_idx : index label
        Row index of the patient in ``df``.

    Returns
    -------
    pc_df : pandas dataframe
        One row per column of ``df`` with the patient's 'z' and
        'percentile' scores.
    """
    # collect one result per column, then build the dataframe once
    # (avoids the quadratic concat-in-a-loop of the original)
    results = {}
    for c in df.columns:
        col = df.loc[:, c]
        # controls: every row except the patient's, as an (n, 1) array
        con_data = col.drop(pat_idx).values.reshape(-1, 1)
        # patient: a (1, 1) array, as get_percentiles expects
        pat_data = np.array([col.loc[pat_idx]]).reshape(-1, 1)
        z, pc = get_percentiles(con_data, pat_data)
        results[c] = {'z': z, 'percentile': pc}

    # one row per input column, preserving column order
    pc_df = pd.DataFrame(results).T

    return pc_df
99 changes: 99 additions & 0 deletions modules/case_stats/ttest_single.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Functions for computing t-tests"""

# import required modules
import numpy as np
from scipy import stats

# main function
def ttest_single(x_pat, x_group, s_group, n):
    """Compare one person with a sample of multiple persons.

    The t-value is computed as::

        (x_pat - x_group) / (s_group * sqrt( (n+1)/n ))

    If the resulting t is negative and its magnitude exceeds the one-tailed
    5% critical value of t on n-1 degrees of freedom, the null hypothesis
    that the patient's score is an observation from the control population
    can be rejected (i.e. the patient is considered impaired on the task).

    Parameters
    ----------
    x_pat : float or int
        Patient's score
    x_group : float or int
        Mean of the scores in the control sample
    s_group : float or int
        Standard deviation of scores in the control sample
    n : int
        The size of the control sample

    Returns
    -------
    t : float
        t-value for the comparison of patient with controls
    df : int
        Degrees of freedom, computed as n-1
    p : float
        two-tailed p-value for t

    Notes
    -----
    Implements the formula described in
    Crawford, J. R., Garthwaite, P. H., & Howell, D. C. (2009).
    On comparing a single case with a control sample: An alternative perspective.
    Neuropsychologia, 47(13), 2690-2695.

    Examples
    --------
    Say that the patient and the group have equal means
    and the sample size is 100. Then the following results
    should be produced:
    t-value is zero (no difference at all)
    df's are 99 (sample size of 100 minus 1)
    p-value is one (not significant at all)
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(10,10,1,100)
    't=0.0000000000, df=99.0, p=1.0000000000'
    Say that the patient is two standard deviations below
    the controls. Then, - given a large enough sample -
    the result should be significant:
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8,10,1,100)
    't=-1.9900743804, df=99.0, p=0.0493404337'
    In too small a sample, the difference will not be significant
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8,10,1,10)
    't=-1.9069251785, df=9.0, p=0.0888979325'
    Nor will a smaller difference be significant in a large sample
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8.1,10,1,100)
    't=-1.8905706614, df=99.0, p=0.0616060466'
    """
    # difference between patient and group mean, over the group SD
    # inflated by sqrt((n+1)/n) for the single-case comparison
    numerator = float(x_pat) - x_group
    denominator = s_group * np.sqrt((n + 1.0) / n)
    t = numerator / denominator

    # degrees of freedom for a control sample of size n
    df = n - 1

    # two-sided p-value (more conservative than the one-sided p
    # suggested in the cited article)
    p = 2 * stats.t.sf(np.abs(t), df)

    return t, df, p

# Run the doctests only when this file is executed directly, so that
# importing the module stays free of side effects.
if __name__ == "__main__":
    import doctest
    doctest.run_docstring_examples(ttest_single, globals(),
                                   verbose=False, name='ttest_single')
Loading

0 comments on commit 31b2713

Please sign in to comment.