diff --git a/.gitignore b/.gitignore index 4fc0837..bc2c79a 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ sra_metadata/GSE192902* sra_metadata/moufarrej_geo* src/__pycache__ +src/.ipynb_checkpoints/ .Rproj.user *.Rproj diff --git a/src/gene_coverage_plot.ipynb b/src/gene_coverage_plot.ipynb new file mode 100644 index 0000000..cb5c870 --- /dev/null +++ b/src/gene_coverage_plot.ipynb @@ -0,0 +1,560 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "5da38501", + "metadata": {}, + "source": [ + "## OWNER" + ] + }, + { + "cell_type": "markdown", + "id": "ebcc104f", + "metadata": {}, + "source": [ + "Name: Flomics Biotech SL \n", + "\n", + "date: 19/06/2025" + ] + }, + { + "cell_type": "markdown", + "id": "4ae957f0", + "metadata": {}, + "source": [ + "## MAIN" + ] + }, + { + "cell_type": "markdown", + "id": "52e31236", + "metadata": {}, + "source": [ + "Generate gene coverage plot" + ] + }, + { + "cell_type": "markdown", + "id": "fc154ab0", + "metadata": {}, + "source": [ + "## LIBRARIES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d40ba3ab-b568-4dfb-acdb-c8450c9eb50c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "import sys\n", + "import json" + ] + }, + { + "cell_type": "markdown", + "id": "530bed1f", + "metadata": {}, + "source": [ + "## FUNCTIONS/PAR_SETTING" + ] + }, + { + "cell_type": "markdown", + "id": "f2da9e8f-9e77-42a5-84ef-141702556023", + "metadata": {}, + "source": [ + "### Set font for plots" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2064c8d6-e67a-4e66-a909-89df7752e0ec", + "metadata": {}, + "outputs": [], + "source": [ + "plt.rcParams['font.family'] = 'sans-serif'\n", + "plt.rcParams['font.sans-serif'] = ['Arial']\n", + "plt.rcParams['mathtext.fontset'] = 'custom'\n", + "plt.rcParams['mathtext.rm'] = 'Arial'\n", + "plt.rcParams['mathtext.it'] = 'Arial:italic'\n", + "plt.rcParams['mathtext.bf'] = 'Arial:bold'" + ] + }, + { + "cell_type": "markdown", + "id": "40fe77e4-60f6-4132-a15b-f9d0c1458253", + "metadata": {}, + "source": [ + "### External scripts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b701cdfa-4d91-49ad-a0ab-d27ec1f9a981", + "metadata": {}, + "outputs": [], + "source": [ + "#dodged barplot function\n", + "script_dir = \"./\"\n", + "sys.path.append(script_dir)\n", + "\n", + "from ridgeline_from_known_density_plot import ridgeline_from_known_density_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e51378d-bfcf-4d8c-ac17-af501ab2ad56", + "metadata": {}, + "outputs": [], + "source": [ + "def find_results_folder(base_dataset_path, prefix):\n", + " for folder in os.listdir(base_dataset_path):\n", + " if os.path.isdir(os.path.join(base_dataset_path, folder)):\n", + " if prefix in folder and 'results' in folder:\n", + " return folder\n", + " return None" + ] + }, + { + "cell_type": "markdown", + "id": "a234ae7e", + "metadata": {}, + "source": [ + "## WORKFLOW" + ] + }, + { + "cell_type": "markdown", + "id": "253950be-269a-4adb-b97f-6a3800d37b73", + "metadata": {}, + "source": [ + "### Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e6bf5f86-330b-42a8-8dc5-1868b714ce44", + "metadata": {}, + "outputs": [], + "source": [ + "base_folder = './'\n", + "chalasani_folder = './'\n", + "output_folder = os.getcwd()\n", + "\n", + "datasets = {\n", + " 'block': {'sliced': True, 'n_parts': 4, 'prefix': 'block'},\n", + " 'chalasani': {'sliced': True, 'n_parts': 6, 'prefix': 'chalasani'},\n", + " 'chen': {'sliced': True, 'n_parts': 21, 'prefix': 'chen'},\n", + " 'flomics_2': {'sliced': False, 'subfolder_path': 'Flomics_2'},\n", + " 'decru': {'sliced': True, 'n_parts': 6, 'prefix': 'decru'},\n", + " 'giraldez': {'sliced': False, 'prefix': 'giraldez_results_full'},\n", + " 'ibarra': {'sliced': True, 'n_parts': 14, 'prefix': 'ibarra'},\n", + " 'moufarrej': {'sliced': True, 'n_parts': 17, 'prefix': 'moufarrej'},\n", + " 'ngo': {'sliced': False, 'prefix': 'ngo'},\n", + " 'reggiardo': {'sliced': False, 'prefix': 'reggiardo'},\n", + " 'roskams': {'sliced': True, 'n_parts': 6, 'prefix': 'roskams'},\n", + " 'rozowsky': {'sliced': True, 'n_parts': 9, 'prefix': 'rozowsky'},\n", + " 'sun': {'sliced': True, 'n_parts': 3, 'prefix': 'sun'},\n", + " 'tao': {'sliced': True, 'n_parts': 12, 'prefix': 'tao'},\n", + " #'taowei': {'sliced': True, 'n_parts': 5, 'prefix': 'taowei'},\n", + " 'toden': {'sliced': True, 'n_parts': 5, 'prefix': 'toden'},\n", + " 'wang': {'sliced': True, 'n_parts': 4, 'prefix': 'wang'},\n", + " 'zhu': {'sliced': True, 'n_parts': 5, 'prefix': 'zhu'},\n", + "}\n", + "\n", + "dataset_colors = {\n", + " \"block\": \"#b3b3b3\", \"decru\": \"#009E73\", \"zhu\": \"#ffd633\", \"chen\": \"#997a00\", \"flomics_2\": \"#144D6B\",\n", + " \"ngo\": \"#fa8072\", \"roskams\": \"#944dff\", \"moufarrej\": \"#CC79A7\", \"sun\": \"#D55E00\",\n", + " \"tao\": \"#0072B2\", \"toden\": \"#800099\", \"ibarra\": \"#800000\", \"chalasani_merged\": \"#800040\",\n", + " \"rozowsky\": \"#006600\", \"taowei\": \"#B32400\", \"giraldez\": \"#B1CC71\", \"reggiardo\": \"#F1085C\", \"wang\": \"#FE8F42\"\n", + "}" + ] + }, + { + "cell_type": "markdown", + "id": "2f40a95d-f889-4aec-b65c-cbd798946b35", + "metadata": {}, + "source": [ + "### Save per sample gene coverage" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "883c4a91-9ff1-4c2a-9e7b-9fdf6615d91d", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# ---- Process ----\n", + "all_profiles = {}\n", + "\n", + "for dataset, meta in datasets.items():\n", + " sliced = meta['sliced']\n", + " n_parts = meta.get('n_parts', 1)\n", + " prefix = meta.get('prefix', None) # Only exists if sliced or traditional unsliced\n", + " print(f\"\\nProcessing dataset: {dataset}\")\n", + "\n", + " if sliced:\n", + " for i in range(n_parts):\n", + " print(f\"\\nProcessing dataset: {dataset+\"_\"+str(i)}\")\n", + " part_folder = f\"{prefix}_results_{i:02d}.part/multiqc/star_salmon/multiqc_data\"\n", + " if dataset == \"chalasani\":\n", + " part_folder = chalasani_folder+f\"{prefix}_results_{i:02d}.part/multiqc/star_salmon/multiqc_data\"\n", + " file_path = os.path.join(base_folder, dataset, part_folder, \"mqc_qualimap_gene_coverage_profile_Normalised.txt\")\n", + " if os.path.isfile(file_path):\n", + " try:\n", + " df = pd.read_csv(file_path, sep=\"\\t\")\n", + " print(df.shape)\n", + " all_profiles[dataset+\"_\"+str(i)] = df\n", + " #handle special case (decru)\n", + " if dataset == \"decru\":\n", + " df[\"Sample\"] = df[\"Sample\"].str.split('_').str[0]\n", + " #handle special case (chalasani)\n", + " if dataset == \"chalasani_merged\":\n", + " df[\"Sample\"] = \"X\" + df[\"Sample\"].astype(str)\n", + " if dataset in [\"chalasani_merged\",\"decru\",\"toden\"]:\n", + " print(df.head())\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " else:\n", + " print(f\"Missing: {file_path}\")\n", + " else:\n", + " if \"subfolder_path\" in meta:\n", + " part_path = os.path.join(base_folder, meta[\"subfolder_path\"], \"multiqc\", \"star_salmon\", \"multiqc_data\")\n", + " else:\n", + " base_dataset_path = os.path.join(base_folder, dataset)\n", + " matched_folder = find_results_folder(base_dataset_path, prefix)\n", + " if not matched_folder:\n", + " print(f\"Could not find results folder for: {dataset}\")\n", + " continue\n", + " part_path = os.path.join(base_dataset_path, matched_folder, \"multiqc\", \"star_salmon\", \"multiqc_data\")\n", + "\n", + " file_path = os.path.join(part_path, \"mqc_qualimap_gene_coverage_profile_Normalised.txt\")\n", + " if os.path.isfile(file_path):\n", + " try:\n", + " df = pd.read_csv(file_path, sep=\"\\t\")\n", + " print(df.shape)\n", + " print(df.head())\n", + " all_profiles[dataset+\"_\"+str(part_path)] = df\n", + " except Exception as e:\n", + " print(f\"Error reading {file_path}: {e}\")\n", + " else:\n", + " print(f\"Missing: {file_path}\")" + ] + }, + { + "cell_type": "markdown", + "id": "e8caf53b-a5e8-48be-8b33-be5aec20e062", + "metadata": {}, + "source": [ + "### preparing dataset for ridge plot" + ] + }, + { + "cell_type": "markdown", + "id": "6dbcf992-ed2c-4cc4-a431-e9e3135d03c4", + "metadata": {}, + "source": [ + "#### Merge gene coverage in one dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3594b8e5-f53a-4b96-af29-cfae9a1e3ade", + "metadata": {}, + "outputs": [], + "source": [ + "# List to hold individual DataFrames\n", + "all_dataframes = []\n", + "\n", + "# Iterate through the dictionary and append each DataFrame to the list\n", + "for key, df in all_profiles.items():\n", + " all_dataframes.append(df)\n", + " \n", + "\n", + "# Concatenate all DataFrames into a single, unique dataset\n", + "sample_gene_coverage = pd.concat(all_dataframes)\n", + "sample_gene_coverage = sample_gene_coverage.set_index('Sample')\n", + "print(sample_gene_coverage.shape)\n", + "sample_gene_coverage.head()" + ] + }, + { + "cell_type": "markdown", + "id": "af7b6a41-64ba-43dd-b806-edff48a450ce", + "metadata": {}, + "source": [ + "#### Merge gene coverage with metadata (handle specific sample names)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1f93ab25-b5d2-4acc-89a1-b02ef7d07f58", + "metadata": {}, + "outputs": [], + "source": [ + "#load metadata\n", + "metadata = pd.read_csv(\"../tables/cfRNA-meta_per_sample_metadata.tsv\",sep=\"\\t\")\n", + "#handle special case (Flomics_1 and Flomics2)\n", + "condition = metadata['dataset_batch'].str.contains(r'^flomics_.*', na=False)\n", + "metadata.loc[condition, 'run'] = metadata['sample_display_name']\n", + "#set index\n", + "metadata = metadata.set_index(\"run\")\n", + "print(metadata.shape)\n", + "metadata.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "587b0e7b-17f6-40d3-911b-9bf62775f082", + "metadata": {}, + "outputs": [], + "source": [ + "#merge with metadata\n", + "full_gene_coverage_data = sample_gene_coverage.join(metadata, how='inner')\n", + "print(full_gene_coverage_data.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4fc674d2-f9d7-4222-be2f-928f55c611e9", + "metadata": {}, + "outputs": [], + "source": [ + "list(set(sample_gene_coverage.index) - set(full_gene_coverage_data.index))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "404ecf53-32e5-40d0-8f75-83ebc22fb5db", + "metadata": {}, + "outputs": [], + "source": [ + "list(set(metadata[\"dataset_short_name\"].unique()) - set(full_gene_coverage_data[\"dataset_short_name\"].unique()))" + ] + }, + { + "cell_type": "markdown", + "id": "5d50cd9f-530a-4f12-910a-f40576100669", + "metadata": {}, + "source": [ + "#### rename batches name with standard names and load batches proper colors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "875bc923-3c44-4511-9bc3-f056f1707220", + "metadata": {}, + "outputs": [], + "source": [ + "#load JSON file\n", + "with open(\"./dataset_mappings.json\") as json_file:\n", + " data_mapping = json.load(json_file)\n", + "\n", + "order_label = data_mapping['datasetVisualOrder']\n", + "color_dataset = data_mapping['datasetsPalette']\n", + "data_mapping = data_mapping['datasetsLabels']\n", + "order_label = [data_mapping.get(item, item) for item in order_label]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "f20bb0a3-2304-457b-a9d9-ab04d0562daa", + "metadata": {}, + "outputs": [], + "source": [ + "#replace names in batches column\n", + "full_gene_coverage_data[\"dataset_batch\"].replace(data_mapping, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7a1a9d15-f5f3-4b13-b79b-9826dcea4bd4", + "metadata": {}, + "outputs": [], + "source": [ + "full_gene_coverage_data[\"dataset_batch\"]" + ] + }, + { + "cell_type": "markdown", + "id": "ca19bc3f-c58c-48dd-8338-ddaf40e5ef4e", + "metadata": {}, + "source": [ + "#### melt dataset" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "a6e8fc3f-ab65-4935-97c3-2f76e9e56d58", + "metadata": {}, + "outputs": [], + "source": [ + "#reset index\n", + "full_gene_coverage_data['Sample'] = full_gene_coverage_data.index\n", + "full_gene_coverage_data = full_gene_coverage_data.reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "9d0b06e8-148a-4f04-b39b-5d5c66426304", + "metadata": {}, + "outputs": [], + "source": [ + "position_list = [str(i) for i in range(0, 99)]\n", + "print(position_list)\n", + "print(full_gene_coverage_data.columns)\n", + "melt_dataset = full_gene_coverage_data.melt(id_vars=['dataset_batch'], value_vars=position_list)\n", + "melt_dataset[\"variable\"] = pd.to_numeric(melt_dataset[\"variable\"])\n", + "print(melt_dataset.shape)\n", + "melt_dataset.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "5d3ca5e0-6cb9-4b19-88ba-8e04de9638cc", + "metadata": {}, + "outputs": [], + "source": [ + "#correct dataset labels in color dict\n", + "dataset_batch_colors = {data_mapping.get(old_key, old_key): value \n", + " for old_key, value in color_dataset.items()}\n", + "\n", + "print(dataset_batch_colors)" + ] + }, + { + "cell_type": "markdown", + "id": "e7d205ca-1721-4fea-8ebf-596157f32803", + "metadata": {}, + "source": [ + "### ridge plot" + ] + }, + { + "cell_type": "markdown", + "id": "28f42c14-2791-41d6-85f2-10e6811ee46a", + "metadata": {}, + "source": [ + "#### default plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "6fdcbbc1-399e-420c-b410-4d72949f43bf", + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "reload(heatmap_plot)\n", + "\n", + "ridgeline_from_known_density_plot(\n", + " dataset = melt_dataset,\n", + " x_column = \"variable\",\n", + " density_column = \"value\",\n", + " category_column = \"dataset_batch\",\n", + " fig_width = 10,\n", + " fig_height = 10,\n", + " output = \"gene_coverage_merged\",\n", + " show_ylabel = False,\n", + " xlabel = \"Transcript position (%)\",\n", + " xlabel_va = 0.51,\n", + " fill_color = dataset_batch_colors,\n", + " show_title = False,\n", + " line_width = 0.5,\n", + " normalization = 'mean'\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2e13e62a-3b9c-4518-960f-ad431935a05e", + "metadata": {}, + "source": [ + "#### Y-axis plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "45d2930e-da88-4b9d-98e1-3d543eea0c76", + "metadata": {}, + "outputs": [], + "source": [ + "reload(heatmap_plot)\n", + "\n", + "ridgeline_from_known_density_plot(\n", + " dataset = melt_dataset,\n", + " x_column = \"variable\",\n", + " density_column = \"value\",\n", + " category_column = \"dataset_batch\",\n", + " category_order = order_label[:-1],\n", + " fig_width = 10,\n", + " fig_height = 15,\n", + " output = \"gene_coverage_merged_y_axis\",\n", + " #layout\n", + " show_ylabel = True,\n", + " xlabel = \"Transcript position (%)\",\n", + " xlabel_va = 0.51,\n", + " fill_color = dataset_batch_colors,\n", + " show_title = False,\n", + " line_width = 0.5,\n", + " y_ticks_frequency = 1,\n", + " normalization = 'mean',\n", + " #individual sub-plots\n", + " show_individual_yaxis = True,\n", + " show_individual_hspace = 0.1,\n", + " consistent_y_scale = False,\n", + " show_yticks = False,\n", + " ylabel_fontsize = 10\n", + ")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/gene_coverage_profile_fig2_tmpH.ipynb b/src/gene_coverage_profile_fig2_tmpH.ipynb deleted file mode 100644 index 7ea2198..0000000 --- a/src/gene_coverage_profile_fig2_tmpH.ipynb +++ /dev/null @@ -1,283 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "id": "59017647-22aa-469c-b8fe-ba2d91c2a1d0", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import pandas as pd\n", - "import matplotlib.pyplot as plt" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8c8c3748-333d-4c4c-aa70-7ec8cc49ee1e", - "metadata": {}, - "outputs": [], - "source": [ - "# ---- Helper ----\n", - "def find_results_folder(base_dataset_path, prefix):\n", - " for folder in os.listdir(base_dataset_path):\n", - " if os.path.isdir(os.path.join(base_dataset_path, folder)):\n", - " if prefix in folder and 'results' in folder:\n", - " return folder\n", - " return None\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ee4b7a1e-5910-455e-ae58-69d72ad0e184", - "metadata": {}, - "outputs": [], - "source": [ - "# ---- Setup ----\n", - "base_folder = '/mnt/s3fs/flomics-data/FL-cfRNA-meta'\n", - "output_folder = os.getcwd()\n", - "\n", - "datasets = {\n", - " 'block': {'sliced': True, 'n_parts': 4, 'prefix': 'block'},\n", - " 'chalasani_merged': {'sliced': True, 'n_parts': 5, 'prefix': 'chalasani'},\n", - " 'chen': {'sliced': True, 'n_parts': 21, 'prefix': 'chen'},\n", - " 'flomics_1': {'sliced': False, 'subfolder_path': 'Flomics_1/2025_05_16_11_26_41/n3abcb4a/'},\n", - " 'flomics_2': {'sliced': False, 'subfolder_path': 'Flomics_2/2025_06_05_09_32_11/n61e05f8/'},\n", - " 'decru': {'sliced': True, 'n_parts': 6, 'prefix': 'decru'},\n", - " 'giraldez': {'sliced': False, 'prefix': 'giraldez'},\n", - " 'ibarra': {'sliced': True, 'n_parts': 14, 'prefix': 'ibarra'},\n", - " 'moufarrej': {'sliced': True, 'n_parts': 17, 'prefix': 'moufarrej'},\n", - " 'ngo': {'sliced': False, 'prefix': 'ngo'},\n", - " 'reggiardo': {'sliced': False, 'prefix': 'reggiardo'},\n", - " 'roskams': {'sliced': True, 'n_parts': 6, 'prefix': 'roskams'},\n", - " 'rozowsky': {'sliced': True, 'n_parts': 9, 'prefix': 'rozowsky'},\n", - " 'sun': {'sliced': True, 'n_parts': 3, 'prefix': 'sun'},\n", - " 'tao': {'sliced': True, 'n_parts': 12, 'prefix': 'tao'},\n", - " #'taowei': {'sliced': True, 'n_parts': 4, 'prefix': 'taowei'},\n", - " 'toden': {'sliced': True, 'n_parts': 14, 'prefix': 'toden'},\n", - " 'wang': {'sliced': True, 'n_parts': 4, 'prefix': 'wang'},\n", - " 'zhu': {'sliced': True, 'n_parts': 5, 'prefix': 'zhu'},\n", - "}\n", - "\n", - "dataset_colors = {\n", - " \"block\": \"#b3b3b3\", \"decru\": \"#009E73\", \"zhu\": \"#ffd633\", \"chen\": \"#997a00\", \"flomics_1\" : \"#9AB9D6\", \"flomics_2\": \"#144D6B\",\n", - " \"ngo\": \"#fa8072\", \"roskams\": \"#944dff\", \"moufarrej\": \"#CC79A7\", \"sun\": \"#D55E00\",\n", - " \"tao\": \"#0072B2\", \"toden\": \"#800099\", \"ibarra\": \"#800000\", \"chalasani_merged\": \"#800040\",\n", - " \"rozowsky\": \"#006600\", \"taowei\": \"#B32400\", \"giraldez\": \"#B1CC71\", \"reggiardo\": \"#F1085C\", \"wang\": \"#FE8F42\"\n", - "}" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "aa786704-48cc-41f8-849a-2b651259c74e", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Processing dataset: block\n", - "Saved: block_multiqc_coverage.png\n", - "\n", - "Processing dataset: chalasani_merged\n", - "Saved: chalasani_merged_multiqc_coverage.png\n", - "\n", - "Processing dataset: chen\n", - "Saved: chen_multiqc_coverage.png\n", - "\n", - "Processing dataset: flomics_1\n", - "Saved: flomics_1_multiqc_coverage.png\n", - "\n", - "Processing dataset: flomics_2\n", - "Saved: flomics_2_multiqc_coverage.png\n", - "\n", - "Processing dataset: decru\n", - "Saved: decru_multiqc_coverage.png\n", - "\n", - "Processing dataset: giraldez\n", - "Saved: giraldez_multiqc_coverage.png\n", - "\n", - "Processing dataset: ibarra\n", - "Saved: ibarra_multiqc_coverage.png\n", - "\n", - "Processing dataset: moufarrej\n", - "Saved: moufarrej_multiqc_coverage.png\n", - "\n", - "Processing dataset: ngo\n", - "Saved: ngo_multiqc_coverage.png\n", - "\n", - "Processing dataset: reggiardo\n", - "Saved: reggiardo_multiqc_coverage.png\n", - "\n", - "Processing dataset: roskams\n", - "Saved: roskams_multiqc_coverage.png\n", - "\n", - "Processing dataset: rozowsky\n", - "Saved: rozowsky_multiqc_coverage.png\n", - "\n", - "Processing dataset: sun\n", - "Saved: sun_multiqc_coverage.png\n", - "\n", - "Processing dataset: tao\n", - "Saved: tao_multiqc_coverage.png\n", - "\n", - "Processing dataset: toden\n", - "Saved: toden_multiqc_coverage.png\n", - "\n", - "Processing dataset: wang\n", - "Saved: wang_multiqc_coverage.png\n", - "\n", - "Processing dataset: zhu\n", - "Saved: zhu_multiqc_coverage.png\n", - "\n", - "✅ All per-dataset and combined plots saved.\n" - ] - } - ], - "source": [ - "\n", - "\n", - "\n", - "# ---- Process ----\n", - "all_profiles = {}\n", - "\n", - "for dataset, meta in datasets.items():\n", - " sliced = meta['sliced']\n", - " n_parts = meta.get('n_parts', 1)\n", - " prefix = meta.get('prefix', None) # Only exists if sliced or traditional unsliced\n", - " print(f\"\\nProcessing dataset: {dataset}\")\n", - " \n", - " part_means = []\n", - "\n", - " if sliced:\n", - " for i in range(n_parts):\n", - " part_folder = f\"{prefix}_results_{i:02d}.part/multiqc/star_salmon/multiqc_data\"\n", - " file_path = os.path.join(base_folder, dataset, part_folder, \"mqc_qualimap_gene_coverage_profile_Normalised.txt\")\n", - " if os.path.isfile(file_path):\n", - " try:\n", - " df = pd.read_csv(file_path, sep=\"\\t\")\n", - " sample_only = df.drop(columns=[\"Sample\"])\n", - " part_means.append(sample_only.mean(axis=0))\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " else:\n", - " print(f\"Missing: {file_path}\")\n", - " else:\n", - " if \"subfolder_path\" in meta:\n", - " part_path = os.path.join(base_folder, meta[\"subfolder_path\"], \"multiqc\", \"star_salmon\", \"multiqc_data\")\n", - " else:\n", - " base_dataset_path = os.path.join(base_folder, dataset)\n", - " matched_folder = find_results_folder(base_dataset_path, prefix)\n", - " if not matched_folder:\n", - " print(f\"Could not find results folder for: {dataset}\")\n", - " continue\n", - " part_path = os.path.join(base_dataset_path, matched_folder, \"multiqc\", \"star_salmon\", \"multiqc_data\")\n", - "\n", - " file_path = os.path.join(part_path, \"mqc_qualimap_gene_coverage_profile_Normalised.txt\")\n", - " if os.path.isfile(file_path):\n", - " try:\n", - " df = pd.read_csv(file_path, sep=\"\\t\")\n", - " sample_only = df.drop(columns=[\"Sample\"])\n", - " part_means.append(sample_only.mean(axis=0))\n", - " except Exception as e:\n", - " print(f\"Error reading {file_path}: {e}\")\n", - " else:\n", - " print(f\"Missing: {file_path}\")\n", - "\n", - "\n", - " # Average across all .part means\n", - " if part_means:\n", - " dataset_mean_df = pd.DataFrame(part_means).mean(axis=0).reset_index()\n", - " dataset_mean_df.columns = [\"position\", \"coverage\"]\n", - " dataset_mean_df[\"position\"] = dataset_mean_df[\"position\"].astype(int)\n", - " dataset_mean_df.sort_values(\"position\", inplace=True)\n", - " all_profiles[dataset] = dataset_mean_df\n", - "\n", - " # Per-dataset plot\n", - " plt.figure(figsize=(10, 4))\n", - " plt.plot(dataset_mean_df[\"position\"], dataset_mean_df[\"coverage\"],\n", - " label=dataset, color=dataset_colors.get(dataset))\n", - " plt.title(f\"Normalized Transcript Coverage: {dataset}\")\n", - " plt.xlabel(\"Transcript position (%)\")\n", - " plt.ylabel(\"Normalized coverage\")\n", - " plt.tight_layout()\n", - " plt.savefig(os.path.join(output_folder, f\"{dataset}_multiqc_coverage.png\"))\n", - " plt.close()\n", - " print(f\"Saved: {dataset}_multiqc_coverage.png\")\n", - " else:\n", - " print(f\"No data for {dataset}\")\n", - "\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1b0cfff9-37eb-4d43-aafb-6d7d3ee88397", - "metadata": {}, - "outputs": [], - "source": [ - "dataset_labels = {\n", - " \"block\": \"Block\",\n", - " \"chalasani_merged\": \"Chalasani\",\n", - " \"chen\": \"Chen\",\n", - " \"flomics_1\" : \"Flomics 1\",\n", - " \"flomics_2\": \"Flomics 2\",\n", - " \"decru\": \"Decruyenaere\",\n", - " \"giraldez\": \"Giráldez\",\n", - " \"ibarra\": \"Ibarra\",\n", - " \"moufarrej\": \"Moufarrej\",\n", - " \"ngo\": \"Ngo\",\n", - " \"reggiardo\": \"Reggiardo\",\n", - " \"roskams\": \"Roskams-Hieter\",\n", - " \"rozowsky\": \"Rozowsky\",\n", - " \"sun\": \"Sun\",\n", - " \"tao\": \"Tao\",\n", - " \"taowei\": \"Wei\",\n", - " \"toden\": \"Toden\",\n", - " \"wang\": \"Wang\",\n", - " \"zhu\": \"Zhu\"\n", - "}\n", - "\n", - "# ---- Combined Plot ----\n", - "plt.figure(figsize=(12, 6))\n", - "for dataset, df in all_profiles.items():\n", - " plt.plot(df[\"position\"], df[\"coverage\"], label=dataset_labels.get(dataset, dataset), color=dataset_colors.get(dataset))\n", - "plt.xlabel(\"Transcript position (%)\")\n", - "plt.ylabel(\"Normalized coverage\")\n", - "plt.title(\"Average Gene Coverage per Dataset (MultiQC normalized)\")\n", - "plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')\n", - "plt.tight_layout()\n", - "plt.savefig(os.path.join(output_folder, \"multiqc_style_normalized_coverage_combined.png\"), dpi=600)\n", - "plt.close()\n", - "\n", - "print(\"All per-dataset and combined plots saved.\")\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/ridgeline_from_known_density_plot.py b/src/ridgeline_from_known_density_plot.py new file mode 100644 index 0000000..7d1dbea --- /dev/null +++ b/src/ridgeline_from_known_density_plot.py @@ -0,0 +1,312 @@ +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import os +import warnings +import seaborn as sns + +def ridgeline_from_known_density_plot(# ~ Basic input ~ # + dataset: pd.DataFrame, + x_column: str, + density_column: str, + category_column: str, + category_order: list = None, + overlap: float = 0.5, + fig_width: float = 10, + fig_height: float = 7, + output: str = None, + # ~ Layout ~ # + show_xlabel: bool = True, + xlabel: str = None, + xlabel_va: float = 0.5, + xlabel_fontsize: float = 12, + show_ylabel: bool = True, + ylabel: str = None, + ylabel_fontsize: float = 12, + show_title: bool = True, + title: str = None, + title_fontsize: float = 14, + xticks_fontsize: float = 10, + show_yticks: bool = True, + yticks_fontsize: float = 10, + y_ticks_frequency: float = 1, + fill_alpha: float = 0.7, + line_color: str = 'black', + fill_color: str = 'skyblue', + line_width: float = 0.5, + # ~ Normalization ~ # + normalization: str = None, + # ~ Individual sub-plots ~ # + show_individual_yaxis: bool = False, + show_individual_hspace: float = 0.4, + consistent_y_scale: bool = True): # New parameter + """ + Generate a ridgeline plot from pre-calculated density values. + + Args: + + # ~ Basic input ~ # + dataset (pd.DataFrame): The input pandas DataFrame containing pre-calculated densities values + x_column (str): Column containing the coordinates for the density curves. + density_column (str): Column containing the density values for the curves. + category_column (str): Column containing categories to group the curves. + category_order (list, optional): Order in which categories should be plotted from bottom to top. + overlap (float, optional): The degree of overlap between the lines (from 0 to 1). Defaults to 0.5. + fig_width (float, optional): The width of the figure in inches. Defaults to 10. + fig_height (float, optional): The height of the figure in inches. Defaults to 7. + output (str, optional): The full path filename (without extension) to save the plot. + + # ~ Layout ~ # + show_xlabel (bool, optional): Display the X-axis label. Defaults to True. + xlabel (str, optional): Custom label for the X-axis. Defaults to None. + xlabel_va (float, optional): X-axis label vertical adjustment. Default to 0.5. + xlabel_fontsize (float, optional): Font size for the X-axis label. Defaults to 12. + show_ylabel (bool, optional): Display the Y-axis label. Defaults to True + ylabel (str, optional): Custom label for the Y-axis. Defaults to None. + ylabel_fontsize (float, optional): Font size for the Y-axis label. Defaults to 12. + show_title (bool, optional): Display plot title. Defaults to True. + title (str, optional): Custom title. Defaults to None. + title_fontsize: float (optional): Font size for title. Defaults to 14. + xticks_fontsize (float, optional): Font size for the X-axis tick labels. Defaults to 10. + show_yticks (bool, optional): Show Y-axis ticks and label. Default to + yticks_fontsize (float, optional): Font size for the Y-axis tick labels. Defaults to 10. + y_ticks_frequency (float, optional): Frequency of Y-axis ticks. Default to 1 + fill_alpha (float, optional): Transparency of the filled area under the curves. Defaults to 0.7. + line_color (str, optional): Color of the curve lines. Defaults to 'black'. + fill_color (str or dict, optional): Color of the filled area under the curves. + Can be a single color string for all categories, + or a dictionary mapping category names + Defaults to 'skyblue'. + line_width (float, otiopnal): Size of the line in the curve. Default to 0.5. + + # ~ Normalization ~ # + + normalization (str, optional): normalization method for the curve. Default to None. + If "mean", {category_column} and {x_column} will be grouped to calculate the {density_column} mean + If "median", {category_column} and {x_column} will be grouped to calculate the {density_column} median + + # ~ Individual sub-plots ~ # + + show_individual_yaxis (bool, optional): If True, each category will have its own y-axis. + This overrides 'overlap' to 0. + Defaults to False. + show_individual_hspace (float, optional): The height spacing between subplots when `show_individual_yaxis` is True. Defaults to 0.4. + consistent_y_scale (bool, optional): If False, The y scale will be different for each subplot. Default to True + """ + + # Input Validation + if not isinstance(dataset, pd.DataFrame): + raise TypeError("Input dataset must be a pandas DataFrame.") + if x_column not in dataset.columns: + raise ValueError(f"Column '{x_column}' not found in the dataset.") + if density_column not in dataset.columns: + raise ValueError(f"Column '{density_column}' not found in the dataset.") + if category_column not in dataset.columns: + raise ValueError(f"Column '{category_column}' not found in the dataset.") + if not pd.api.types.is_numeric_dtype(dataset[x_column]): + raise TypeError(f"Column '{x_column}' must be numeric.") + if not pd.api.types.is_numeric_dtype(dataset[density_column]): + raise TypeError(f"Column '{density_column}' must be numeric.") + if not (0 <= overlap <= 1): + raise ValueError("Overlap must be between 0 and 1.") + if category_order is not None and not isinstance(category_order, list): + raise TypeError("category_order must be a list.") + if category_order is not None: + if not all(cat in dataset[category_column].unique() for cat in category_order): + raise ValueError("All categories in category_order must be present in category_column.") + if not isinstance(fill_color, (str, dict)): + raise TypeError("fill_color must be a string or a dictionary.") + if normalization is not None and normalization not in ["mean","median"]: + raise ValueError("Invalid value for `normalization`. Choose from 'mean' or 'median'.") + if show_individual_yaxis: + if overlap != 0: + warnings.warn("`show_individual_yaxis` is True, so `overlap` is being set to 0 to prevent overlap.", UserWarning) + overlap = 0 + + # ~ Normalization ~ # + if normalization == "mean": + dataset = dataset.groupby([category_column, x_column])[density_column].mean().reset_index() + if normalization == "median": + dataset = dataset.groupby([category_column, x_column])[density_column].median().reset_index() + + # ~ Get unique categories and define order ~ # + categories = dataset[category_column].unique() + if category_order is None: + categories = sorted(categories) + else: + # Filter and order categories based on category_order, ensuring they exist in the data + categories = [cat for cat in category_order if cat in categories] + + # ~ Assigning colors based on fill_color type ~ # + category_fill_colors = {} + if isinstance(fill_color, str): + # If a single color string, apply to all categories + for cat in categories: + category_fill_colors[cat] = fill_color + else: # It's a dictionary + prop_cycle = plt.rcParams['axes.prop_cycle'] + default_colors = prop_cycle.by_key()['color'] + color_idx = 0 + for cat in categories: + if cat in fill_color: + category_fill_colors[cat] = fill_color[cat] + else: + # Use default cycle color if not specified in the dictionary + category_fill_colors[cat] = default_colors[color_idx % len(default_colors)] + warnings.warn(f"No fill_color specified for category '{cat}'. Using default color: {category_fill_colors[cat]}", UserWarning) + color_idx += 1 + + if show_individual_yaxis: + # Create subplots + fig, axes = plt.subplots(len(categories), 1, figsize=(fig_width, fig_height), sharex=True, gridspec_kw={'hspace': show_individual_hspace}) + if len(categories) == 1: # If only one subplot, axes is not an array + axes = [axes] + else: + fig, ax = plt.subplots(figsize=(fig_width, fig_height)) + + # ~ Calculate the maximum density across all plots to determine the appropriate vertical shift ~ # + if not show_individual_yaxis: + max_overall_density = dataset[density_column].max() + if pd.isna(max_overall_density) or max_overall_density == 0: + vertical_shift_amount = 0.5 # A default small shift if no density is found + warnings.warn("No density found in the density_column. Plots may appear flat or empty.", UserWarning) + else: + vertical_shift_amount = max_overall_density * (1 - overlap) + else: + if consistent_y_scale: + vertical_shift_amount = 0 # No vertical shift for individual y-axes + # Determine max density for consistent Y-axis limit across subplots + global_max_density = dataset[density_column].max() + if pd.isna(global_max_density) or global_max_density == 0: + global_max_density = 1.0 + + + # ~ Plot each density curve ~ # + for i, category in enumerate(categories): + subset = dataset[dataset[category_column] == category].sort_values(by=x_column) + x_kde = subset[x_column].values + y_kde = subset[density_column].values + + if len(y_kde) == 0: + warnings.warn(f"Category '{category}' has no valid density data. Skipping plot for this category.", UserWarning) + continue + + if show_individual_yaxis: + current_ax = axes[i] + y_shifted = y_kde # No shift needed, each has its own axis + else: + current_ax = ax + y_shifted = y_kde + (i * vertical_shift_amount) + + # Get the appropriate fill color for the current category + current_fill_color = category_fill_colors.get(category, 'gray') # Fallback to gray if not mapped + + # Plot the filled area + if show_individual_yaxis: + current_ax.fill_between(x_kde, 0, y_shifted, # Start fill from 0 for individual y-axis + color=current_fill_color, alpha=fill_alpha) + if consistent_y_scale: + current_ax.yaxis.set_ticks(np.arange(0, global_max_density * 1.1, y_ticks_frequency)) # Set consistent Y-axis limits + else: + max_density = dataset[dataset[category_column] == category][density_column].max() + current_ax.yaxis.set_ticks(np.arange(0, max_density * 1.1, y_ticks_frequency)) # Set variable Y-axis limit + else: + current_ax.fill_between(x_kde, i * vertical_shift_amount, y_shifted, + color=current_fill_color, alpha=fill_alpha) + + + # Plot the outline + current_ax.plot(x_kde, y_shifted, color=line_color, linewidth=line_width) + + if show_individual_yaxis: + current_ax.tick_params(axis='x', labelsize=xticks_fontsize) + if show_yticks: + current_ax.tick_params(axis='y', labelsize=yticks_fontsize) + else: + current_ax.set_yticks([]) + current_ax.spines['right'].set_visible(False) + current_ax.spines['top'].set_visible(False) + current_ax.grid(axis='x', linestyle='--', alpha=0.7) + if show_ylabel: + current_ax.set_ylabel(category, fontsize=ylabel_fontsize, rotation=0, ha='right', va='center') + current_ax.yaxis.set_label_coords(-0.05, 0.5) + else: + current_ax.set_ylabel('') + + else: + # Add category label as a custom y-tick for ridgeline plot + current_ax.text(-0.02, i * vertical_shift_amount, str(category), va='center', ha='right', + fontsize=yticks_fontsize, color='black', transform=current_ax.get_yaxis_transform()) + + + # ~ Set x-axis label ~ # + if show_xlabel: + if show_individual_yaxis: + # Place x-label on the bottom-most subplot + bottom_ax = axes[-1] + bottom_ax.set_xlabel(xlabel if xlabel is not None else x_column, fontsize=xlabel_fontsize, x = xlabel_va, ha='center') + # Ensure ticks are visible on the bottom-most axis + bottom_ax.tick_params(axis='x', labelbottom=True) + else: + ax.set_xlabel(xlabel if xlabel is not None else x_column, fontsize=xlabel_fontsize, x = xlabel_va, ha='center') + else: + if show_individual_yaxis: + axes[-1].fig.supxlabel('') + else: + ax.set_xlabel('') + + # ~ Set y-axis label and ticks (hide default numerical y-axis for ridgeline) ~ # + if not show_individual_yaxis: + if show_ylabel: + ax.set_ylabel(ylabel if ylabel is not None else 'Category', fontsize=ylabel_fontsize) + else: + ax.set_ylabel('') + + ax.set_yticks([]) + ax.tick_params(axis='y', length=0) + ax.tick_params(axis='x', labelsize=xticks_fontsize) + # ~ Remove top, right, and left plot lines for a cleaner look ~ # + ax.spines['left'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['top'].set_visible(False) + ax.yaxis.set_ticks_position('none') + ax.grid(axis='x', linestyle='--', alpha=0.7) + + + # ~ Set title ~ # + if show_title: + if show_individual_yaxis: + fig.suptitle(title if title is not None else f'Density Plots of {x_column} by {category_column}', + fontsize=title_fontsize) + else: + ax.set_title(title if title is not None else f'Ridgeline Plot of {x_column} Densities by {category_column}', + fontsize=title_fontsize) + + #normal layout for regular plot + if not show_individual_yaxis: + plt.tight_layout() + # Use fig.subplots_adjust for individual curve plots + #TO DO: make it automatic + if show_individual_yaxis: + fig.subplots_adjust(left = 0.3, bottom=0.03, top=1) + + # ~ Saving Plot (if output path is provided) ~ + if output: + dir_name = os.path.dirname(output) + if dir_name and not os.path.exists(dir_name): + os.makedirs(dir_name) + + filename_pdf = output + ".pdf" + plt.savefig(filename_pdf) + print(f"Plot saved to {filename_pdf}") + + filename_png = output + ".png" + plt.savefig(filename_png) + print(f"Plot saved to {filename_png}") + + filename_svg = output + ".svg" + plt.rcParams["svg.fonttype"] = "none" + plt.savefig(filename_svg) + print(f"Box plot saved to {filename_svg}") + + plt.show() \ No newline at end of file