diff --git a/examples/data_handling/filter_seq.ipynb b/examples/data_handling/filter_seq.ipynb index e78b3008..9651fe01 100644 --- a/examples/data_handling/filter_seq.ipynb +++ b/examples/data_handling/filter_seq.ipynb @@ -12,7 +12,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 1, "outputs": [], "source": [ "import aaanalysis as aa\n", @@ -22,8 +22,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:22:29.817583849Z", - "start_time": "2024-06-27T20:22:29.631806177Z" + "end_time": "2024-06-28T05:39:41.476978471Z", + "start_time": "2024-06-28T05:39:37.767522801Z" } }, "id": "f6652f89954b8969" @@ -40,7 +40,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 2, "outputs": [ { "name": "stdout", @@ -53,7 +53,7 @@ { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1991CAPSID_1621990100.0000001
1992CAPSID_47751991100.0000001
1993CAPSID_50051992100.0000001
1994CAPSID_47011993100.0000001
1995CAPSID_49621994100.0000001
1996CAPSID_45171995100.0000001
1997CAPSID_45161996100.0000001
1998CAPSID_43001997100.0000001
1999CAPSID_41081998100.0000001
2000CAPSID_49841999100.0000001
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1996CAPSID_45171995100.0000001
1997CAPSID_45161996100.0000001
1998CAPSID_43001997100.0000001
1999CAPSID_41081998100.0000001
2000CAPSID_49841999100.0000001
\n" }, "metadata": {}, "output_type": "display_data" @@ -71,7 +71,7 @@ "df_clust = aa.filter_seq(df_seq=df_seq)\n", "n_clust = df_clust[\"cluster\"].nunique()\n", "print(f\"Number of CD-HIT clusters: {n_clust}\")\n", - "aa.display_df(df_clust, n_rows=-10, show_shape=True)\n", + "aa.display_df(df_clust, n_rows=-5, show_shape=True)\n", "\n", "# Filtering using MMSeqs\n", "df_clust = aa.filter_seq(df_seq=df_seq, method=\"mmseqs\")\n", @@ -81,8 +81,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:22:45.025340662Z", - "start_time": "2024-06-27T20:22:30.433050007Z" + "end_time": "2024-06-28T05:40:12.343453881Z", + "start_time": "2024-06-28T05:39:41.481403647Z" } }, "id": "b9e4f7053030fd3f" @@ -99,7 +99,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 3, "outputs": [ { "name": "stdout", @@ -111,7 +111,7 @@ { "data": { "text/plain": "", - "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1991CAPSID_47741988100.0000001
1992CAPSID_48061989100.0000001
1993CAPSID_48391990100.0000001
1994CAPSID_48711991100.0000001
1995CAPSID_49041992100.0000001
1996CAPSID_49361993100.0000001
1997CAPSID_49681994100.0000001
1998CAPSID_50021995100.0000001
1999CAPSID_50371996100.0000001
2000CAPSID_50691997100.0000001
\n" + "text/html": "\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
 entryclusteridentity_with_repis_representative
1996CAPSID_47261993100.0000001
1997CAPSID_47581994100.0000001
1998CAPSID_47901995100.0000001
1999CAPSID_48231996100.0000001
2000CAPSID_48551997100.0000001
\n" }, "metadata": {}, "output_type": "display_data" @@ -120,13 +120,13 @@ "source": [ "# Select redundancy-reduced sequences\n", "df_selected = df_clust[df_clust[\"is_representative\"] == 1]\n", - "aa.display_df(df_clust, n_rows=-10, show_shape=True)" + "aa.display_df(df_clust, n_rows=-5, show_shape=True)" ], "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:22:45.110610021Z", - "start_time": "2024-06-27T20:22:45.020134758Z" + "end_time": "2024-06-28T05:40:12.348662734Z", + "start_time": "2024-06-28T05:39:55.289555016Z" } }, "id": "50a17ad119d34446" @@ -143,7 +143,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 4, "outputs": [ { "name": "stdout", @@ -168,8 +168,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:23:26.767419508Z", - "start_time": "2024-06-27T20:22:45.081053810Z" + "end_time": "2024-06-28T05:40:36.565870468Z", + "start_time": "2024-06-28T05:39:55.302397196Z" } }, "id": "ef5135c86e164027" @@ -186,7 +186,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 5, "outputs": [ { "name": "stdout", @@ -204,8 +204,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:23:47.145309577Z", - "start_time": "2024-06-27T20:23:26.755456935Z" + "end_time": "2024-06-28T06:04:21.597822737Z", + "start_time": "2024-06-28T05:40:36.579933745Z" } }, "id": "7d55920a4d8a2183" @@ -222,7 +222,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 6, "outputs": [ { "name": "stdout", @@ -241,8 +241,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:23:50.951949901Z", - "start_time": "2024-06-27T20:23:47.156300620Z" + "end_time": "2024-06-28T06:04:21.607074402Z", + "start_time": "2024-06-28T05:40:56.604564848Z" } }, "id": "62d0351a3c25f270" @@ -259,7 +259,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "outputs": [ { "name": "stdout", @@ -284,8 +284,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:24:06.633475548Z", - "start_time": "2024-06-27T20:23:50.959943351Z" + "end_time": "2024-06-28T06:04:21.612534056Z", + "start_time": "2024-06-28T05:41:00.298248616Z" } }, "id": "5da69ca3ad47a131" @@ -302,7 +302,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 8, "outputs": [], "source": [ "# Sort sequences by clusters\n", @@ -311,8 +311,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:24:07.190489991Z", - "start_time": "2024-06-27T20:24:06.648321841Z" + "end_time": "2024-06-28T06:04:21.614335869Z", + "start_time": "2024-06-28T05:41:16.960240709Z" } }, "id": "ce1a5c401c42fe9d" @@ -329,14 +329,14 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 9, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Time without multiprocessing: 0.54 seconds\n", - "Time with multiprocessing. 0.68 seconds\n" + "Time with multiprocessing. 0.67 seconds\n" ] } ], @@ -358,8 +358,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-06-27T20:24:08.416840033Z", - "start_time": "2024-06-27T20:24:07.208454665Z" + "end_time": "2024-06-28T06:04:21.615894271Z", + "start_time": "2024-06-28T05:41:17.519080704Z" } }, "id": "87a6ef861d00ab12"