MannLabs · GeorgWa · Oct 17, 2023 · Oct 15, 2023 · Oct 15, 2023 · Oct 15, 2023
diff --git a/alphabase/protein/fasta.py b/alphabase/protein/fasta.py
@@ -5,6 +5,10 @@
 import os
 import itertools
 import copy
+import ahocorasick
+from tqdm import tqdm
+
+import warnings
 
 from Bio import SeqIO
 from typing import Union
@@ -46,13 +50,16 @@ def read_fasta_file(fasta_filename:str=""):
                 parts = record.id.split("|")  # pipe char
                 if len(parts) > 1:
                     id = parts[1]
+                    gene_org = parts[2]
                 else:
                     id = record.name
+                    gene_org = record.name
                 sequence = str(record.seq)
                 entry = {
                     "protein_id": id,
                     "full_name": record.name,
                     "gene_name": get_uniprot_gene_name(record.description),
+                    "gene_org": gene_org,
                     "description": record.description,
                     "sequence": sequence,
                 }
@@ -1310,3 +1317,78 @@ def load_hdf(self, hdf_file:str, load_mod_seq:bool=False):
             self.protein_df = _hdf.library.protein_df.values
         except (AttributeError, KeyError, ValueError, TypeError):
             print(f"No protein_df in {hdf_file}")
+
+def annotate_precursor_df(
+        precursor_df : pd.DataFrame,
+        protein_df : pd.DataFrame,
+    ):
+    """Annotate a list of peptides with genes and proteins by using an ahocorasick automaton.
+
+    Parameters
+    ----------
+
+    precursor_df : pd.DataFrame
+        A dataframe containing a sequence column.
+
+    protein_df : pd.DataFrame
+        protein dataframe containing `sequence` column.
+
+    Returns
+    -------
+
+    pd.DataFrame
+        updated precursor_df with `genes`, `proteins` and `cardinality` columns.
+
+    """
+    if len(precursor_df) == 0:
+        return precursor_df
+
+    if len(protein_df) == 0:
+        return precursor_df
+
+    if 'sequence' not in precursor_df.columns:
+        raise SystemError('precursor_df must contain a sequence column')
+
+    peptide_df = pd.DataFrame({
+        'sequence': precursor_df['sequence'].unique()
+    })
+
+    # ahocorasick automaton will be used to index the protein_df
+    automaton = ahocorasick.Automaton()
+    for i, peptide_sequence in enumerate(peptide_df['sequence']):
+        automaton.add_word(peptide_sequence, i)
+    automaton.make_automaton()
+
+    genes = [[] for _ in range(len(peptide_df))]
+    proteins = [[] for _ in range(len(peptide_df))]
+
+    # iter as dictionary
+    for protein_entry in tqdm(protein_df.to_dict('records')):
+        idx = [idx for _, idx in automaton.iter(protein_entry['sequence'])]
+        idx = np.unique(idx)
+        if len(idx) > 0:
+            for i in idx:
+                genes[i].append(protein_entry['gene_org'])
+                proteins[i].append(protein_entry['protein_id'])
+
+    peptide_df['genes'] = [';'.join(g) for g in genes]
+    peptide_df['proteins'] = [';'.join(g) for g in proteins]
+    peptide_df['cardinality'] = [len(g) for g in genes]
+
+    if 'genes' in precursor_df.columns:
+        precursor_df.drop(columns=['genes'], inplace=True)
+
+    if 'proteins' in precursor_df.columns:
+        precursor_df.drop(columns=['proteins'], inplace=True)
+
+    if 'proteotypic' in precursor_df.columns:
+        precursor_df.drop(columns=['proteotypic'], inplace=True)
+
+    if 'cardinality' in precursor_df.columns:
+        precursor_df.drop(columns=['cardinality'], inplace=True)
+
+    failed_annotation = np.sum(peptide_df['genes'] == '')
+    if failed_annotation > 0:
+        warnings.warn(f'{failed_annotation} peptides could not be annotated')
+
+    return precursor_df.merge(peptide_df, on='sequence', how='left')
diff --git a/alphabase/spectral_library/decoy.py b/alphabase/spectral_library/decoy.py
@@ -224,7 +224,7 @@ def get_decoy_lib(self, name:str,
                 target_lib, **kwargs
             )
         else:
-            return None
+            raise ValueError(f'Decoy method {name} not found.')
 
 decoy_lib_provider:SpecLibDecoyProvider = SpecLibDecoyProvider()
 """

diff --git a/nbdev_nbs/protein/annotate_fasta.ipynb b/nbdev_nbs/protein/annotate_fasta.ipynb
@@ -0,0 +1,126 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%reload_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from alphabase.protein import fasta\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "protein_df = pd.DataFrame([\n",
+    "    {\n",
+    "        'protein_id': 'xx',\n",
+    "        'full_name': 'xx_xx',\n",
+    "        'gene_name': 'x_x',\n",
+    "        'gene_org': 'xx_HUMAN',\n",
+    "        'sequence': 'MACDESTYKBKFGHIKLMNPQRST'\n",
+    "    },\n",
+    "    {\n",
+    "        'protein_id': 'yy',\n",
+    "        'full_name': 'yy_yy',\n",
+    "        'gene_name': 'y_y',\n",
+    "        'gene_org': 'yy_HUMAN',\n",
+    "        'sequence': 'FGHIKLMNPQR'\n",
+    "    }\n",
+    "])\n",
+    "\n",
+    "precursor_df = pd.DataFrame([\n",
+    "    {\n",
+    "        'charge': 2,\n",
+    "        'sequence': 'MACDESTYK',\n",
+    "    },\n",
+    "    {\n",
+    "        'charge': 3,\n",
+    "        'sequence': 'MACDESTYK',\n",
+    "    },\n",
+    "    {\n",
+    "        'charge': 2,\n",
+    "        'sequence': 'MNPQ',\n",
+    "    },\n",
+    "    {\n",
+    "        'charge': 3,\n",
+    "        'sequence': 'MNPQKKKKK',\n",
+    "    }\n",
+    "])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "100%|██████████| 2/2 [00:00<00:00, 17848.10it/s]"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "/Users/georgwallmann/Documents/git/alphabase/alphabase/protein/fasta.py:1384: UserWarning: 1 peptides could not be annotated\n",
+      "  return precursor_df.merge(peptide_df, on='sequence', how='left')\n"
+     ]
+    }
+   ],
+   "source": [
+    "output_precursor_df = fasta.annotate_precursor_df(precursor_df, protein_df)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "assert len(output_precursor_df) == len(precursor_df)\n",
+    "assert output_precursor_df['cardinality'].tolist() == [1, 1, 2, 0]\n",
+    "assert output_precursor_df['proteins'].tolist() == ['xx', 'xx', 'xx;yy', '']\n",
+    "assert output_precursor_df['genes'].tolist() == ['xx_HUMAN', 'xx_HUMAN', 'xx_HUMAN;yy_HUMAN', '']"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "alpha",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/requirements.txt b/requirements.txt
@@ -12,3 +12,4 @@ scikit-learn
 regex
 # pydivsufsort
 dask
+pyahocorasick