Skip to content

Commit

Permalink
new main stats
Browse files Browse the repository at this point in the history
  • Loading branch information
mwegrzyn committed Nov 5, 2018
1 parent b2cc7d7 commit 31b2713
Show file tree
Hide file tree
Showing 17 changed files with 4,920 additions and 480 deletions.
4 changes: 4 additions & 0 deletions CFMT/output/cambridge_main_results.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
,,% correct,percentile,t,df,p
Face recognition,CFMT 1,100,72,0.59,26,0.561
Face recognition,CFMT 2,37,0,-3.40,26,0.002
Face recognition,CFMT 3,33,2,-2.25,26,0.033
9 changes: 9 additions & 0 deletions modules/case_stats/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Compute statistics for comparing individual against group"""

__version__ = '0.0.1'
__author__ = 'Martin Wegrzyn'

# import modules objects
from .percentile_scores import *
from .ttest_single import *
from .norm_stats import *
50 changes: 50 additions & 0 deletions modules/case_stats/norm_stats.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
"""Function to make output dataframe"""

# import required modules
import numpy as np
import pandas as pd
from scipy import stats

from .percentile_scores import *
from .ttest_single import *


def make_percentiles(group_mean, group_std, pat_mean):
    """Return (z, percentile) of a patient score under a normal group model.

    The previous implementation approximated this by drawing 1,000,000
    random samples from N(group_mean, group_std) and feeding them through
    ``get_percentiles``; that was nondeterministic and slow.  The closed
    form below is the exact value that simulation converges to.

    Parameters
    ----------
    group_mean : float
        Mean of the control group.
    group_std : float
        Standard deviation of the control group (must be > 0).
    pat_mean : float
        The patient's raw score.

    Returns
    -------
    tuple of (z, percentile)
        The patient's z-score and the corresponding percentile (0 to 100),
        matching the (z, percentile) pair ``get_percentiles`` returns.
    """
    # z-score of the patient under the group's normal distribution
    z = (float(pat_mean) - group_mean) / group_std
    # cumulative probability of that z, scaled from 0-1 to 0-100
    percentile = stats.norm.cdf(z) * 100

    return z, percentile


def make_tstats_df(df, pat_idx, pat_label='JB'):
    """Run a single-case t-test for every column of a dataframe.

    For each column, the control group's mean, std and count are computed
    after dropping the patient's row, and ``ttest_single`` compares the
    patient's raw score against the group.

    Parameters
    ----------
    df : pandas dataframe
        Rows are participants, columns are tasks/features; values must be
        numeric.
    pat_idx : index label
        Row index of the patient in ``df``.
    pat_label : str, optional
        Label used for the patient's row in the intermediate table
        (default 'JB', which was previously hard-coded).

    Returns
    -------
    ttest_df : pandas dataframe
        One row per column of ``df`` with formatted 't', 'df' and 'p'.
    """
    # summary statistics of the control group (patient's row removed)
    group_df = df.drop(pat_idx).apply(['mean', 'std', 'count'])
    # raw data of the patient, relabelled so it can be looked up by name
    pat_df = df.loc[[pat_idx]]
    pat_df.index = [pat_label]
    # stack group statistics and patient values into one lookup table
    stats_df = pd.concat([group_df, pat_df])

    # collect one t-test result per task/feature
    d = {}
    for c in stats_df.columns:
        t, dof, p = ttest_single(stats_df.loc[pat_label, c],
                                 stats_df.loc['mean', c],
                                 stats_df.loc['std', c],
                                 stats_df.loc['count', c])
        # format to match the project's CSV output (2 dp t, int df, 3 dp p)
        d[c] = {'t': '%.2f' % t, 'df': int(dof), 'p': '%.3f' % p}

    # one row per input column, with a fixed column order
    ttest_df = pd.DataFrame(d).T.loc[:, ['t', 'df', 'p']]

    return ttest_df
86 changes: 86 additions & 0 deletions modules/case_stats/percentile_scores.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
"""Functions for computing percentile scores"""

# import required modules
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing


# get z scores and transform to percentiles
def get_percentiles(con_data, pat_data):
    """Transform the patient's raw score into a z-score and a percentile.

    The patient's value is standardized against the controls' mean and
    population standard deviation (ddof=0) -- numerically identical to the
    ``sklearn.preprocessing.StandardScaler`` used previously -- and the
    z-score is mapped through the normal CDF to a 0-100 percentile.

    Parameters
    ----------
    con_data : (n, 1) shaped numpy array
        All the data of the controls.
    pat_data : (1, 1) shaped numpy array
        Data of the patient.

    Returns
    -------
    z : float
        The patient's z-score relative to the controls.
    percentile : float
        The patient's percentile score (0 to 100).
    """
    con_data = np.asarray(con_data, dtype=float)
    pat_data = np.asarray(pat_data, dtype=float)
    # standardize with the controls' mean and population std (ddof=0),
    # matching StandardScaler's default behaviour exactly
    mean = con_data.mean(axis=0)
    std = con_data.std(axis=0)
    z = ((pat_data - mean) / std)[-1]
    # transform the z-score to a cumulative probability
    cdf = stats.norm.cdf(z)[-1]
    # scale from 0 to 100 instead of 0 to 1
    percentile = cdf * 100
    # get z out of its 1-element array
    z = z[-1]
    return z, percentile


# apply to each column of a df
def make_percentile_df(df, pat_idx):
    """Compute z-scores and percentiles for all columns of a df.

    Applies ``get_percentiles`` to every column, comparing the patient's
    value against the remaining (control) rows.

    Parameters
    ----------
    df : pandas dataframe
        Table with values; every column must be numeric.
    pat_idx : index label
        Row index of the patient in ``df``.

    Returns
    -------
    pc_df : pandas dataframe
        One row per column of ``df`` with the patient's 'z' and
        'percentile' scores.
    """
    # collect one result per column, then build the dataframe once
    # (avoids the quadratic concat-in-a-loop of the original)
    results = {}
    for c in df.columns:
        col = df.loc[:, c]
        # controls: every row except the patient's, as an (n, 1) array
        con_data = col.drop(pat_idx).values.reshape(-1, 1)
        # patient: a (1, 1) array, as get_percentiles expects
        pat_data = np.array([col.loc[pat_idx]]).reshape(-1, 1)
        z, pc = get_percentiles(con_data, pat_data)
        results[c] = {'z': z, 'percentile': pc}

    # one row per input column, preserving column order
    pc_df = pd.DataFrame(results).T

    return pc_df
99 changes: 99 additions & 0 deletions modules/case_stats/ttest_single.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Functions for computing t-tests"""

# import required modules
import numpy as np
from scipy import stats

# main function
def ttest_single(x_pat, x_group, s_group, n):
    """Compare one person with a sample of multiple persons.

    The t-value is computed as::

        (x_pat - x_group) / (s_group * sqrt( (n+1)/n ))

    If the resulting t is negative and its magnitude exceeds the one-tailed
    5% critical value of t on n-1 degrees of freedom, the null hypothesis
    that the patient's score is an observation from the control population
    can be rejected (i.e. the patient is considered impaired on the task).

    Parameters
    ----------
    x_pat : float or int
        Patient's score
    x_group : float or int
        Mean of the scores in the control sample
    s_group : float or int
        Standard deviation of scores in the control sample
    n : int
        The size of the control sample

    Returns
    -------
    t : float
        t-value for the comparison of patient with controls
    df : int
        Degrees of freedom, computed as n-1
    p : float
        two-tailed p-value for t

    Notes
    -----
    Implements the formula described in
    Crawford, J. R., Garthwaite, P. H., & Howell, D. C. (2009).
    On comparing a single case with a control sample: An alternative perspective.
    Neuropsychologia, 47(13), 2690-2695.

    Examples
    --------
    Say that the patient and the group have equal means
    and the sample size is 100. Then the following results
    should be produced:
    t-value is zero (no difference at all)
    df's are 99 (sample size of 100 minus 1)
    p-value is one (not significant at all)
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(10,10,1,100)
    't=0.0000000000, df=99.0, p=1.0000000000'
    Say that the patient is two standard deviations below
    the controls. Then, - given a large enough sample -
    the result should be significant:
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8,10,1,100)
    't=-1.9900743804, df=99.0, p=0.0493404337'
    In too small a sample, the difference will not be significant
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8,10,1,10)
    't=-1.9069251785, df=9.0, p=0.0888979325'
    Nor will a smaller difference be significant in a large sample
    >>> 't=%.10f, df=%.1f, p=%.10f'%ttest_single(8.1,10,1,100)
    't=-1.8905706614, df=99.0, p=0.0616060466'
    """
    # difference between patient and group mean, over the group SD
    # inflated by sqrt((n+1)/n) for the single-case comparison
    numerator = float(x_pat) - x_group
    denominator = s_group * np.sqrt((n + 1.0) / n)
    t = numerator / denominator

    # degrees of freedom for a control sample of size n
    df = n - 1

    # two-sided p-value (more conservative than the one-sided p
    # suggested in the cited article)
    p = 2 * stats.t.sf(np.abs(t), df)

    return t, df, p

# Run the doctests only when this file is executed directly, so that
# importing the module stays free of side effects.
if __name__ == "__main__":
    import doctest
    doctest.run_docstring_examples(ttest_single, globals(),
                                   verbose=False, name='ttest_single')
Loading

0 comments on commit 31b2713

Please sign in to comment.