From d2b0b9aa67e8874b6c7494dbd0e8cfd77d818bd6 Mon Sep 17 00:00:00 2001 From: stephanbreimann Date: Mon, 1 Jul 2024 15:43:37 +0200 Subject: [PATCH] Update AAclust().filter_coverage() method documentation --- aaanalysis/feature_engineering/_aaclust.py | 15 +++-- .../aaclust_filter_coverage.ipynb | 58 ++++++++++++------- 2 files changed, 43 insertions(+), 30 deletions(-) diff --git a/aaanalysis/feature_engineering/_aaclust.py b/aaanalysis/feature_engineering/_aaclust.py index 07a4d817..d9152f11 100644 --- a/aaanalysis/feature_engineering/_aaclust.py +++ b/aaanalysis/feature_engineering/_aaclust.py @@ -621,14 +621,13 @@ def filter_coverage(self, """ Select a redundancy-reduced set of numerical scales with defined subcategory coverage. - This method reduces the number of numerical scales in the feature matrix `X` by clustering them. - It ensures that the selected clusters cover a minimum percentage (`min_coverage`) of unique subcategories - in `names_ref`. - - The process involves clustering the scales in `X` and selecting one scale per cluster. The initial number of - clusters is determined by the number of unique subcategories in `names_ref`. The number of clusters is increased - step-wise until the overlap (coverage) between the unique elements in `names_ref` and the subcategories of - the selected scales meets or exceeds the defined threshold (`min_coverage`). + This method reduces the number of numerical scales in the feature matrix ``X``, while + ensuring that the selected scales cover a minimum percentage (``min_coverage``) of subcategories. + + The process involves clustering the scales in ``X`` and selecting one scale per cluster. The initial number of + clusters is determined by the number of unique subcategories in ``names_ref``. The number of clusters is + increased step-wise until the overlap (coverage) between the unique elements in ``names_ref`` and the + subcategories of the selected scales meets a defined threshold (``min_coverage``). Parameters ---------- diff --git a/examples/feature_engineering/aaclust_filter_coverage.ipynb b/examples/feature_engineering/aaclust_filter_coverage.ipynb index f3eca8d0..e48578d9 100644 --- a/examples/feature_engineering/aaclust_filter_coverage.ipynb +++ b/examples/feature_engineering/aaclust_filter_coverage.ipynb @@ -12,8 +12,17 @@ }, { "cell_type": "code", - "execution_count": 6, - "outputs": [], + "execution_count": 18, + "outputs": [ + { + "data": { + "text/plain": "", + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
1LINS030110ASA/VolumeAccessible surface area (ASA)ASA (folded coil/turn)Total median ac...s et al., 2003)
2LINS030113ASA/VolumeAccessible surface area (ASA)ASA (folded coil/turn)% total accessi...s et al., 2003)
3JANJ780101ASA/VolumeAccessible surface area (ASA)ASA (folded protein)Average accessi...n et al., 1978)
4JANJ780103ASA/VolumeAccessible surface area (ASA)ASA (folded protein)Percentage of e...n et al., 1978)
5LINS030104ASA/VolumeAccessible surface area (ASA)ASA (folded protein)Total median ac...s et al., 2003)
\n" + }, + "metadata": {}, + "output_type": "display_data" + } + ], "source": [ "import aaanalysis as aa\n", "aa.options[\"verbose\"] = False\n", @@ -27,14 +36,16 @@ "df_cat = aa.load_scales(name=\"scales_cat\")\n", "names_ref = df_cat[df_cat[\"scale_id\"].isin(scale_ids)][\"subcategory\"].tolist()\n", "\n", - "# Create AAclu model\n", - "aac = aa.AAclust()" + "# Create AAclust model\n", + "aac = aa.AAclust()\n", + "\n", + "aa.display_df(df_cat, n_rows=5)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-01T12:01:06.950226733Z", - "start_time": "2024-07-01T12:01:06.895543108Z" + "end_time": "2024-07-01T13:42:52.419140553Z", + "start_time": "2024-07-01T13:42:52.328580814Z" } }, "id": "376640688cf8ec0c" @@ -51,20 +62,20 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 15, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "From 100 scales, the following 69 scales were selected:\n", - "DataFrame shape: (69, 5)\n" + "From 100 scales, the following 71 scales were selected:\n", + "DataFrame shape: (71, 5)\n" ] }, { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
7CHOC760102ASA/VolumeAccessible surface area (ASA)ASA (folded proteins)Residue accessi...(Chothia, 1976)
24BIOV880101ASA/VolumeBuriedBuriabilityInformation val...u et al., 1988)
28ARGP820103ASA/VolumeBuriedBuriedMembrane-buried...s et al., 1982)
30CHOC760104ASA/VolumeBuriedBuriedProportion of r...(Chothia, 1976)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
7CHOC760102ASA/VolumeAccessible surface area (ASA)ASA (folded proteins)Residue accessi...(Chothia, 1976)
24BIOV880101ASA/VolumeBuriedBuriabilityInformation val...u et al., 1988)
28ARGP820103ASA/VolumeBuriedBuriedMembrane-buried...s et al., 1982)
30CHOC760104ASA/VolumeBuriedBuriedProportion of r...(Chothia, 1976)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
\n" }, "metadata": {}, "output_type": "display_data" @@ -75,13 +86,14 @@ "selected_scale_ids = aac.filter_coverage(X, scale_ids=scale_ids, df_cat=df_cat, names_ref=names_ref)\n", "print(f\"From 100 scales, the following {len(selected_scale_ids)} scales were selected:\")\n", "df_cat_selected = df_cat[df_cat[\"scale_id\"].isin(selected_scale_ids)]\n", + "\n", "aa.display_df(df_cat_selected, show_shape=True, n_rows=5)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-01T12:01:09.219343360Z", - "start_time": "2024-07-01T12:01:08.247292300Z" + "end_time": "2024-07-01T13:42:08.693242009Z", + "start_time": "2024-07-01T13:42:07.582210186Z" } }, "id": "7c8aba23004a2d86" @@ -98,7 +110,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 16, "outputs": [ { "name": "stdout", @@ -111,7 +123,7 @@ { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
24BIOV880101ASA/VolumeBuriedBuriabilityInformation val...u et al., 1988)
28ARGP820103ASA/VolumeBuriedBuriedMembrane-buried...s et al., 1982)
29CHOC760103ASA/VolumeBuriedBuriedProportion of r...(Chothia, 1976)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
48CHOC760101ASA/VolumeVolumeAccessible surface area (ASA)Residue accessi...(Chothia, 1976)
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
28ARGP820103ASA/VolumeBuriedBuriedMembrane-buried...s et al., 1982)
29CHOC760103ASA/VolumeBuriedBuriedProportion of r...(Chothia, 1976)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
48CHOC760101ASA/VolumeVolumeAccessible surface area (ASA)Residue accessi...(Chothia, 1976)
65DAYM780101CompositionAA compositionAA compositionAmino acid comp... et al., 1978a)
\n" }, "metadata": {}, "output_type": "display_data" @@ -122,13 +134,14 @@ "selected_scale_ids = aac.filter_coverage(X, scale_ids=scale_ids, df_cat=df_cat, names_ref=names_ref, min_coverage=50)\n", "print(f\"From 100 scales, the following {len(selected_scale_ids)} scales were selected:\")\n", "df_cat_selected = df_cat[df_cat[\"scale_id\"].isin(selected_scale_ids)]\n", + "\n", "aa.display_df(df_cat_selected, show_shape=True, n_rows=5)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-01T12:01:10.519883056Z", - "start_time": "2024-07-01T12:01:10.479240121Z" + "end_time": "2024-07-01T13:42:08.757536540Z", + "start_time": "2024-07-01T13:42:08.701941134Z" } }, "id": "ac808ecae1fa7119" @@ -145,20 +158,20 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 17, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "From 100 scales, the following 23 scales were selected:\n", - "DataFrame shape: (23, 5)\n" + "From 100 scales, the following 25 scales were selected:\n", + "DataFrame shape: (25, 5)\n" ] }, { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
24BIOV880101ASA/VolumeBuriedBuriabilityInformation val...u et al., 1988)
30CHOC760104ASA/VolumeBuriedBuriedProportion of r...(Chothia, 1976)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
48CHOC760101ASA/VolumeVolumeAccessible surface area (ASA)Residue accessi...(Chothia, 1976)
54DAWD720101ASA/VolumeVolumeVolumeSize (Dawson, 1972)
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 scale_idcategorysubcategoryscale_namescale_description
28ARGP820103ASA/VolumeBuriedBuriedMembrane-buried...s et al., 1982)
46COHE430101ASA/VolumePartial specific volumePartial specific volumePartial specifi...n-Edsall, 1943)
48CHOC760101ASA/VolumeVolumeAccessible surface area (ASA)Residue accessi...(Chothia, 1976)
65DAYM780101CompositionAA compositionAA compositionAmino acid comp... et al., 1978a)
120BULH740101CompositionMPs (anchor)TFE to surfaceTransfer free e...l-Breese, 1974)
\n" }, "metadata": {}, "output_type": "display_data" @@ -173,13 +186,14 @@ "selected_scale_ids = aac.filter_coverage(X, scale_ids=scale_ids, df_cat=df_cat, names_ref=names_ref, col_name=\"category\")\n", "print(f\"From 100 scales, the following {len(selected_scale_ids)} scales were selected:\")\n", "df_cat_selected = df_cat[df_cat[\"scale_id\"].isin(selected_scale_ids)]\n", + "\n", "aa.display_df(df_cat_selected, show_shape=True, n_rows=5)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-07-01T12:01:12.321903255Z", - "start_time": "2024-07-01T12:01:12.160879261Z" + "end_time": "2024-07-01T13:42:09.855803848Z", + "start_time": "2024-07-01T13:42:09.720455538Z" } }, "id": "6ec0a4614bde0579"