From 38595f8ab10482aa8bc7fde484af4c129da0cb8e Mon Sep 17 00:00:00 2001 From: Shing Chan Date: Fri, 10 Nov 2023 19:43:09 +0000 Subject: [PATCH] feat(classif): cutpoint model; rm spurious sleep - Extend cut-point-based model. - Resolve spurious sleep - now for all models. --- src/accelerometer/accPlot.py | 2 +- src/accelerometer/accProcess.py | 19 ++-- src/accelerometer/classification.py | 155 ++++++++++++++++++++-------- src/accelerometer/summarisation.py | 12 +-- 4 files changed, 127 insertions(+), 61 deletions(-) diff --git a/src/accelerometer/accPlot.py b/src/accelerometer/accPlot.py index ade06975..c1a145b2 100644 --- a/src/accelerometer/accPlot.py +++ b/src/accelerometer/accPlot.py @@ -31,7 +31,7 @@ 'bicycling': 'springgreen', 'tasks-light': 'darkorange', 'SB': 'red', # sedentary behaviour - 'LIPA': 'darkorange', # light physical activity + 'LPA': 'darkorange', # light physical activity 'MVPA': 'green', # moderate-vigorous physical activity 'MPA': 'green', # moderate physical activity 'VPA': 'springgreen', # vigorous physical activity diff --git a/src/accelerometer/accProcess.py b/src/accelerometer/accProcess.py index 727cd778..7c4ccdb2 100644 --- a/src/accelerometer/accProcess.py +++ b/src/accelerometer/accProcess.py @@ -149,12 +149,16 @@ def main(): # noqa: C901 help="""calibration sphere threshold (default : %(default)s mg))""") # activity parameters - parser.add_argument('--mgCutPointMVPA', + parser.add_argument('--mgCpLPA', + metavar="mg", default=45, type=int, + help="""LPA threshold for cut point based activity + definition (default : %(default)s)""") + parser.add_argument('--mgCpMPA', metavar="mg", default=100, type=int, - help="""MVPA threshold for cut point based activity - definition (default : %(default)s)""") - parser.add_argument('--mgCutPointVPA', - metavar="mg", default=425, type=int, + help="""MPA threshold for cut point based activity + definition (default : %(default)s)""") + parser.add_argument('--mgCpVPA', + metavar="mg", default=400, type=int, help="""VPA threshold for cut point based activity definition (default : %(default)s)""") parser.add_argument('--intensityDistribution', @@ -328,8 +332,9 @@ def deleteIntermediateFiles(): activityClassification=args.activityClassification, timeZone=args.timeZone, startTime=args.startTime, endTime=args.endTime, epochPeriod=args.epochPeriod, - stationaryStd=args.stationaryStd, mgCutPointMVPA=args.mgCutPointMVPA, - mgCutPointVPA=args.mgCutPointVPA, activityModel=args.activityModel, + stationaryStd=args.stationaryStd, + mgCpLPA=args.mgCpLPA, mgCpMPA=args.mgCpMPA, mgCpVPA=args.mgCpVPA, + activityModel=args.activityModel, intensityDistribution=args.intensityDistribution, psd=args.psd, fourierFrequency=args.fourierFrequency, fourierWithAcc=args.fourierWithAcc, m10l5=args.m10l5) diff --git a/src/accelerometer/classification.py b/src/accelerometer/classification.py index b2e154ed..ac813547 100644 --- a/src/accelerometer/classification.py +++ b/src/accelerometer/classification.py @@ -20,7 +20,13 @@ import json -def activityClassification(epoch, activityModel="walmsley"): +def activityClassification( + epoch, + activityModel: str = "walmsley", + mgCpLPA: int = 45, + mgCpMPA: int = 100, + mgCpVPA: int = 400 +): """Perform classification of activity states from epoch feature data Based on a balanced random forest with a Hidden Markov Model containing @@ -40,70 +46,66 @@ def activityClassification(epoch, activityModel="walmsley"): :rtype: list(str) """ - use_cutpoints = 'chan' in activityModel - smooth_sleep = 'chan' in activityModel - - activityModel = resolveModelPath(activityModel) + modelPath = resolveModelPath(activityModel) - featureCols = joblib.load(getFileFromTar(activityModel, 'featureCols')) - model = joblib.load(getFileFromTar(activityModel, 'model')) - hmmParams = joblib.load(getFileFromTar(activityModel, 'hmmParams')) - labels = joblib.load(getFileFromTar(activityModel, 'labels')).tolist() + featureCols = joblib.load(getFileFromTar(modelPath, 'featureCols')) + model = joblib.load(getFileFromTar(modelPath, 'model')) + hmmParams = joblib.load(getFileFromTar(modelPath, 'hmmParams')) + labels = joblib.load(getFileFromTar(modelPath, 'labels')).tolist() X = epoch[featureCols].to_numpy() ok = np.isfinite(X).all(axis=1) print(f"{len(epoch) - np.sum(ok)} rows with NaN or Inf values, out of {len(epoch)}") - Y = viterbi(model.predict(X[ok]), hmmParams) - - if smooth_sleep: - sleep = pd.Series(Y == 'sleep') - sleep_streak = ( - sleep.ne(sleep.shift()) - .cumsum() - .pipe(lambda x: x.groupby(x).transform('count') * sleep) - ) - # TODO: hardcoded 120 = 1hr - Y[(Y == 'sleep') & (sleep_streak < 120)] = 'sedentary' + Y = pd.Series(index=epoch.index) + Y.loc[ok] = viterbi(model.predict(X[ok]), hmmParams) - if use_cutpoints: + # TODO: Chan's logic hardcoded here + if activityModel == 'chan': enmo = epoch['enmoTrunc'].to_numpy() - enmo = enmo[mask] - Y[(Y == 'other') & (enmo < .1)] = 'light' - Y[(Y == 'other') & (enmo >= .1)] = 'moderate-vigorous' + other = (Y == 'other') + Y.loc[other & (enmo < 100/1000)] = 'light' + Y.loc[other & (enmo >= 100/1000)] = 'moderate' + Y.loc[other & (enmo > 400/1000)] = 'vigorous' labels.remove('other') labels.append('light') - labels.append('moderate-vigorous') + labels.append('moderate') + labels.append('vigorous') + del enmo + del other - # Append predicted activities to epoch dataframe - epoch["label"] = np.nan - epoch.loc[ok, "label"] = Y + Y = removeSpuriousSleep(Y, activityModel=activityModel) + + # One-hot encoding + epoch.loc[ok, labels] = (Y[ok].to_numpy()[:, None] == labels).astype('float') # MET prediction - METs = joblib.load(getFileFromTar(activityModel, 'METs')) + METs = joblib.load(getFileFromTar(modelPath, 'METs')) if METs is not None: - epoch["MET"] = epoch["label"].replace(METs) + epoch.loc[:, "MET"] = Y.replace(METs) - # One-hot encoding - for lab in labels: - epoch[lab] = 0 - epoch.loc[epoch['label'] == lab, lab] = 1 - # Null values aren't one-hot encoded, so set such instances to NaN - for lab in labels: - epoch.loc[epoch[labels].sum(axis=1) == 0, lab] = np.nan + # Cut-point based classification on non-sleep epochs + YCpOneHot = cutPointModel( + epoch['enmoTrunc'], + cuts={'LPA': mgCpLPA/1000, 'MPA': mgCpMPA/1000, 'VPA': mgCpVPA/1000}, + whr=~(Y == 'sleep') # Note: ~(Y == 'sleep') != (Y != 'sleep') because of NaNs + ) + epoch = epoch.join(YCpOneHot) + labelsCp = list(YCpOneHot.columns) + labels.extend(labelsCp) return epoch, labels def trainClassificationModel( - trainingFile, - labelCol="label", participantCol="participant", - annotationCol="annotation", metCol="MET", - featuresTxt="activityModels/features.txt", - nTrees=1000, maxDepth=None, minSamplesLeaf=1, - cv=None, testParticipants=None, - outDir='model/', - nJobs=1, + trainingFile, + labelCol="label", participantCol="participant", + annotationCol="annotation", metCol="MET", + featuresTxt="activityModels/features.txt", + nTrees=1000, maxDepth=None, minSamplesLeaf=1, + cv=None, testParticipants=None, + outDir='model/', + nJobs=1, ): """Train model to classify activity states from epoch feature data @@ -334,6 +336,69 @@ def log(x): return viterbi_path +def removeSpuriousSleep(Y, activityModel='walmsley', sleepTol='1H'): + """ Remove spurious sleep epochs from activity classification + + :param Series Y: Model output + :param str activityModel: Model identifier + :param str sleepTol: Minimum sleep duration, e.g. '1H' + + :return: Dataframe of revised model output + :rtype: pandas.DataFrame + """ + + newValue = { + 'willetts': 'sit-stand', + 'doherty': 'sedentary', + 'walmsley': 'sedentary', + 'chan': 'sedentary', + }[activityModel] + + sleep = Y == 'sleep' + sleepStreak = ( + sleep.ne(sleep.shift()) + .cumsum() + .pipe(lambda x: x.groupby(x).transform('count') * sleep) + ) + sleepTol = pd.Timedelta(sleepTol) / Y.index.freq + whr = sleep & (sleepStreak < sleepTol) + Y = Y.copy() # no modify original + Y.loc[whr] = newValue + + return Y + + +def cutPointModel(enmo, cuts=None, whr=None): + """Perform classification of activities based on cutpoints. + + :param Series enmo: Timeseries of ENMO. + :param dict cuts: Dictionary of cutpoints for each activity. + + :return: Activity labels. + :rtype: pandas.Series + """ + + if cuts is None: + # default cutpoints + cuts = {'LPA': 45/1000, 'MPA': 100/1000, 'VPA': 400/1000} + + if whr is None: + whr = pd.Series(True, index=enmo.index) + + Y = pd.DataFrame(index=enmo.index, columns=['CpSB', 'CpLPA', 'CpMPA', 'CpVPA', 'CpMVPA']) + + Y.loc[:, 'CpSB'] = (enmo <= cuts['LPA']) & whr + Y.loc[:, 'CpLPA'] = (enmo > cuts['LPA']) & (enmo <= cuts['MPA']) & whr + Y.loc[:, 'CpMPA'] = (enmo > cuts['MPA']) & (enmo <= cuts['VPA']) & whr + Y.loc[:, 'CpVPA'] = (enmo > cuts['VPA']) & whr + Y.loc[:, 'CpMVPA'] = (enmo > cuts['MPA']) & whr + + Y.loc[enmo.isna()] = np.nan + Y = Y.astype('float') + + return Y + + def perParticipantSummaryHTML(dfParam, yTrueCol, yPredCol, pidCol, outHTML): """Provide HTML summary of how well activity classification model works at the per-participant level diff --git a/src/accelerometer/summarisation.py b/src/accelerometer/summarisation.py index 99eda93d..778bbafe 100644 --- a/src/accelerometer/summarisation.py +++ b/src/accelerometer/summarisation.py @@ -15,7 +15,7 @@ def getActivitySummary( # noqa: C901 activityClassification=True, timeZone='Europe/London', startTime=None, endTime=None, epochPeriod=30, stationaryStd=13, minNonWearDuration=60, - mgCutPointMVPA=100, mgCutPointVPA=425, + mgCpLPA=45, mgCpMPA=100, mgCpVPA=400, activityModel="walmsley", intensityDistribution=False, imputation=True, psd=False, fourierFrequency=False, fourierWithAcc=False, m10l5=False @@ -109,10 +109,6 @@ def getActivitySummary( # noqa: C901 # enmoTrunc = max(enmo, 0) data['acc'] = data['enmoTrunc'] * 1000 # convert enmoTrunc to milli-G units - # Cut-point based MVPA and VPA - data['cutPointMVPA'] = data['acc'] >= mgCutPointMVPA - data['cutPointVPA'] = data['acc'] >= mgCutPointVPA - # Resolve interrupts data = resolveInterrupts(data, epochPeriod, summary) # Resolve non-wear @@ -121,7 +117,7 @@ def getActivitySummary( # noqa: C901 # Predict activity from features, and add label column labels = [] if activityClassification: - data, labels = classification.activityClassification(data, activityModel) + data, labels = classification.activityClassification(data, activityModel, mgCpLPA, mgCpMPA, mgCpVPA) # Calculate empirical cumulative distribution function of vector magnitudes if intensityDistribution: @@ -355,7 +351,7 @@ def writeMovementSummaries(data, labels, summary): # noqa: C901 # Hours of activity for each recorded day epochInHours = pd.Timedelta(freq).total_seconds() / 3600 - cols = ['wearTime', 'cutPointMVPA', 'cutPointVPA'] + labels + cols = ['wearTime'] + labels dailyStats = ( data[cols].astype('float') .groupby(data.index.date) @@ -370,7 +366,7 @@ def writeMovementSummaries(data, labels, summary): # noqa: C901 # In the following, we resample, pad and impute the data so that we have a # multiple of 24h for the stats calculations tStart, tEnd = data.index[0], data.index[-1] - cols = ['acc', 'wearTime', 'cutPointMVPA', 'cutPointVPA'] + labels + cols = ['acc', 'wearTime'] + labels if 'MET' in data.columns: cols.append('MET') data = imputeMissing(data[cols].astype('float'))