add analysis on mean sample position

sappelhoff · Jan 31, 2022 · 59fbe99 · 59fbe99
1 parent 4f6987a
commit 59fbe99
Showing 1 changed file with 295 additions and 0 deletions.
diff --git a/analysis_behavior.ipynb b/analysis_behavior.ipynb
@@ -544,6 +544,301 @@
     "print(f\"mode {scipy.stats.mode(df_nsamples['n_samples'])[0][0]:.1f}\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Compute mean sample position for each number / probability"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# NOTE: Trials that consist of a single sample must be dropped for this analysis\n",
+    "# ... because the \"relative\" position of a lone sample is ambiguous (it could be 0 or 1).\n",
+    "# Count the affected AF / AV trials (one row per subject / task / trial).\n",
+    "df.loc[df[\"task\"].isin([\"AF\", \"AV\"])].drop_duplicates(\n",
+    "    subset=[\"subject\", \"task\", \"trial\"]\n",
+    ")[\"n_samples\"].lt(2).sum()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mean (normalized) sample position of each outcome value\n",
+    "_tasks = [\"AF\", \"AV\"]\n",
+    "_dfs = []\n",
+    "for sub in np.unique(df[\"subject\"]):\n",
+    "    for task in _tasks:\n",
+    "\n",
+    "        if task_not_present_for_subject(sub, task):\n",
+    "            continue\n",
+    "\n",
+    "        # all samples from multi-sample trials of this subject in this task\n",
+    "        _mask = (df[\"task\"] == task) & (df[\"n_samples\"] > 1) & (df[\"subject\"] == sub)\n",
+    "        _data = df.loc[_mask, [\"n_samples\", \"sample\", \"outcome\"]]\n",
+    "        arr = _data.to_numpy(dtype=float)\n",
+    "\n",
+    "        # min-max normalization of the sample position within its trial:\n",
+    "        # z = (x - x.min) / (x.max - x.min)\n",
+    "        # \"sample\" is zero-indexed, so the one-indexed position is x = sample + 1,\n",
+    "        # with x.min = 1 and x.max = n_samples\n",
+    "        arr[:, 1] = ((arr[:, 1] + 1) - 1) / (arr[:, 0] - 1)\n",
+    "\n",
+    "        _df = pd.DataFrame(arr[:, 1:], columns=[\"sample_pos_norm\", \"value\"])\n",
+    "        _df[\"control\"] = {\"AF\": \"partial\", \"AV\": \"full\"}[task]\n",
+    "        _df[\"subject\"] = sub\n",
+    "        _dfs.append(_df)\n",
+    "\n",
+    "# one mean position per subject / control / value\n",
+    "df_values = (\n",
+    "    pd.concat(_dfs)\n",
+    "    .astype({\"value\": int})\n",
+    "    .groupby([\"subject\", \"control\", \"value\"])\n",
+    "    .mean()\n",
+    "    .reset_index()\n",
+    ")\n",
+    "df_values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# For each subject, task, trial, option (left/right), outcome ...\n",
+    "# --> compute the \"probability\" (frequency) of that outcome occurring\n",
+    "data = df[df[\"task\"].isin([\"AF\", \"AV\"])][\n",
+    "    [\"subject\", \"task\", \"trial\", \"n_samples\", \"sample\", \"action\", \"outcome\"]\n",
+    "]\n",
+    "data = data[data[\"n_samples\"] > 1]\n",
+    "\n",
+    "# \"experienced probability\" (frequency), specific for the option (left/right):\n",
+    "#   (count of this outcome within the option) / (number of samples from the option)\n",
+    "# NOTE: the counts are grouped by \"action\" as well, so that an outcome that was\n",
+    "# drawn from both options of a trial gets a separate frequency for each option.\n",
+    "# Looking the frequency up by outcome alone would give the rows of both options\n",
+    "# the frequency of whichever option was processed last.\n",
+    "_option_keys = [\"subject\", \"task\", \"trial\", \"action\"]\n",
+    "_n_outcome = data.groupby(_option_keys + [\"outcome\"])[\"sample\"].transform(\"size\")\n",
+    "_n_option = data.groupby(_option_keys)[\"sample\"].transform(\"size\")\n",
+    "_data = data.assign(outcome_freq=_n_outcome / _n_option)\n",
+    "\n",
+    "# sanity check: within each option, the frequencies of the distinct outcomes sum to 1\n",
+    "# (np.allclose instead of \"==\", because a sum of floats is not always exactly 1)\n",
+    "_freq_sums = (\n",
+    "    _data.drop_duplicates(_option_keys + [\"outcome\"])\n",
+    "    .groupby(_option_keys)[\"outcome_freq\"]\n",
+    "    .sum()\n",
+    ")\n",
+    "assert np.allclose(_freq_sums, 1.0)\n",
+    "\n",
+    "# sanity check that we only added a frequency column without changing anything else\n",
+    "pd.testing.assert_frame_equal(\n",
+    "    data,\n",
+    "    _data[[\"subject\", \"task\", \"trial\", \"n_samples\", \"sample\", \"action\", \"outcome\"]],\n",
+    ")\n",
+    "data = _data.copy()\n",
+    "del _data, _n_outcome, _n_option, _freq_sums"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# bin the frequencies, using nbins evenly spaced bin edges between 0 and 1\n",
+    "# e.g., nbins=5 --> edges: array([0.  , 0.25, 0.5 , 0.75, 1.  ])\n",
+    "#\n",
+    "# right=False --> bins are closed on the left:\n",
+    "# bin 1 = [0, 0.25)\n",
+    "# bin 2 = [0.25, 0.5)\n",
+    "# bin 3 = [0.5, 0.75)\n",
+    "# bin 4 = [0.75, 1.0)\n",
+    "# bin 5 = [1.0, infinity)\n",
+    "#\n",
+    "# right=True --> bins are closed on the right:\n",
+    "# bin 1 = (0, 0.25]\n",
+    "# bin 2 = (0.25, 0.5]\n",
+    "# bin 3 = (0.5, 0.75]\n",
+    "# bin 4 = (0.75, 1.0]\n",
+    "# bin 5 = empty\n",
+    "nbins = 6\n",
+    "bins = np.linspace(0, 1, nbins)\n",
+    "\n",
+    "# left-closed bins are written into `data` itself ...\n",
+    "data[\"outcome_freq_bin\"] = np.digitize(data[\"outcome_freq\"], bins=bins, right=False)\n",
+    "\n",
+    "# ... right-closed bins go into a separate copy\n",
+    "datacopy = data.assign(\n",
+    "    outcome_freq_bin=np.digitize(data[\"outcome_freq\"], bins=bins, right=True)\n",
+    ")\n",
+    "\n",
+    "# sanity check bins (see above):\n",
+    "# no frequency of zero, the last left-closed bin holds only frequencies of 1.0,\n",
+    "# and the last right-closed bin is empty\n",
+    "assert data[\"outcome_freq\"].min() > 0\n",
+    "assert np.allclose(\n",
+    "    data.loc[data[\"outcome_freq_bin\"] == nbins, \"outcome_freq\"].to_numpy(), 1.0\n",
+    ")\n",
+    "assert not (datacopy[\"outcome_freq_bin\"] == nbins).any()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Which one to pick? Right open or closed? --> use the right-closed bins (right=True) stored in datacopy\n",
+    "data = datacopy"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# mean (normalized) sample position of each frequency bin\n",
+    "_tasks = [\"AF\", \"AV\"]\n",
+    "_dfs = []\n",
+    "for sub in np.unique(df[\"subject\"]):\n",
+    "    for task in _tasks:\n",
+    "\n",
+    "        if task_not_present_for_subject(sub, task):\n",
+    "            continue\n",
+    "\n",
+    "        # all binned samples of this subject in this task\n",
+    "        _mask = (data[\"task\"] == task) & (data[\"subject\"] == sub)\n",
+    "        _data = data.loc[_mask, [\"n_samples\", \"sample\", \"outcome_freq_bin\"]]\n",
+    "        arr = _data.to_numpy(dtype=float)\n",
+    "\n",
+    "        # min-max normalization of the sample position within its trial:\n",
+    "        # z = (x - x.min) / (x.max - x.min)\n",
+    "        # \"sample\" is zero-indexed, so the one-indexed position is x = sample + 1,\n",
+    "        # with x.min = 1 and x.max = n_samples\n",
+    "        arr[:, 1] = ((arr[:, 1] + 1) - 1) / (arr[:, 0] - 1)\n",
+    "\n",
+    "        _df = pd.DataFrame(arr[:, 1:], columns=[\"sample_pos_norm\", \"freq_bin\"])\n",
+    "        _df[\"control\"] = {\"AF\": \"partial\", \"AV\": \"full\"}[task]\n",
+    "        _df[\"subject\"] = sub\n",
+    "        _dfs.append(_df)\n",
+    "\n",
+    "# one mean position per subject / control / frequency bin\n",
+    "# NOTE: subj 20 has no samples in the lowest frequency bin (i.e., empty first bin)\n",
+    "df_freqs = (\n",
+    "    pd.concat(_dfs)\n",
+    "    .astype({\"freq_bin\": int})\n",
+    "    .groupby([\"subject\", \"control\", \"freq_bin\"])\n",
+    "    .mean()\n",
+    "    .reset_index()\n",
+    ")\n",
+    "df_freqs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Two panels: mean normalized sample position per sample value (left)\n",
+    "# and per binned outcome frequency (right), split by control type.\n",
+    "with sns.plotting_context(\"talk\"):\n",
+    "    fig, axs = plt.subplots(1, 2, figsize=(8, 4))\n",
+    "    ax1, ax2 = axs\n",
+    "\n",
+    "    # settings shared by both panels\n",
+    "    ci = 68\n",
+    "    scale = 0.75\n",
+    "    dodge = True\n",
+    "\n",
+    "    # NOTE: df_values and df_freqs already hold one mean per subject / control / x-value,\n",
+    "    # so they are plotted as they are (no further aggregation needed)\n",
+    "    sns.pointplot(\n",
+    "        data=df_values,\n",
+    "        x=\"value\",\n",
+    "        y=\"sample_pos_norm\",\n",
+    "        hue=\"control\",\n",
+    "        ci=ci,\n",
+    "        ax=ax1,\n",
+    "        scale=scale,\n",
+    "        dodge=dodge,\n",
+    "    )\n",
+    "    sns.pointplot(\n",
+    "        data=df_freqs,\n",
+    "        x=\"freq_bin\",\n",
+    "        y=\"sample_pos_norm\",\n",
+    "        hue=\"control\",\n",
+    "        ci=ci,\n",
+    "        ax=ax2,\n",
+    "        scale=scale,\n",
+    "        dodge=dodge,\n",
+    "    )\n",
+    "\n",
+    "    # label each frequency bin with its center, derived from the bin edges\n",
+    "    # so that the labels stay in sync with nbins (nbins=6 --> 0.1, 0.3, 0.5, 0.7, 0.9)\n",
+    "    bin_centers = (bins[:-1] + bins[1:]) / 2\n",
+    "    ax1.set(xlabel=\"sample value\")\n",
+    "    ax2.set(\n",
+    "        xlabel=\"frequency (binned)\nwithin option\",\n",
+    "        xticklabels=[f\"{_:.1f}\" for _ in bin_centers],\n",
+    "    )\n",
+    "\n",
+    "    for ax in axs:\n",
+    "        # reference line at the middle of a trial\n",
+    "        ax.axhline(0.5, c=\"black\", lw=0.5, ls=\"--\", zorder=0)\n",
+    "        ax.set(\n",
+    "            ylim=(0.4, 0.6),\n",
+    "            ylabel=\"mean sample position\n(normalized by trial length)\",\n",
+    "        )\n",
+    "        # ax.legend() replaces the legend drawn by seaborn, so there is no need\n",
+    "        # to remove the old legend first\n",
+    "        ax.legend(frameon=False, title=None)\n",
+    "\n",
+    "\n",
+    "fig.tight_layout()\n",
+    "sns.despine(fig)\n",
+    "fig"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# slope of \"sample value\": regress the mean sample position on the sample value,\n",
+    "# separately for each type of control\n",
+    "for control in (\"partial\", \"full\"):\n",
+    "    _d = df_values.loc[df_values[\"control\"] == control]\n",
+    "    print(control)\n",
+    "    display(\n",
+    "        pingouin.regression.linear_regression(\n",
+    "            X=_d[\"value\"].to_numpy(), y=_d[\"sample_pos_norm\"].to_numpy()\n",
+    "        ).round(4)\n",
+    "    )\n",
+    "    print(\"------\n\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# scatter plot with a fitted regression line (red), one panel per type of control\n",
+    "grid = sns.lmplot(\n",
+    "    data=df_values,\n",
+    "    x=\"value\",\n",
+    "    y=\"sample_pos_norm\",\n",
+    "    col=\"control\",\n",
+    "    x_jitter=0.25,\n",
+    "    y_jitter=0.025,\n",
+    "    line_kws=dict(color=\"red\"),\n",
+    ")\n",
+    "grid.fig.suptitle(\"sample values\nindividual dots are jittered\", y=1.1)\n",
+    "grid.fig"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},