From 6bab44d0d960fd567a33775a17377971b4188f4b Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Mon, 8 Jan 2018 09:48:24 +0100
Subject: [PATCH 01/18] Update LICENSE

---
 LICENSE | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/LICENSE b/LICENSE
index 8dada3e..6424d06 100644
--- a/LICENSE
+++ b/LICENSE
@@ -186,7 +186,7 @@
       same "printed page" as the copyright notice for easier
       identification within third-party archives.
 
-   Copyright {yyyy} {name of copyright owner}
+   Copyright 2018 Prof. Dr. Bernhard Küster, Chair of Proteomics and Bioanalytics, Technical University of Munich
 
    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.

From 30ff35ac1cd75e8fd457035577b1c4bc8f37f7e9 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Mon, 8 Jan 2018 10:19:00 +0100
Subject: [PATCH 02/18] Update LICENSE

---
 LICENSE | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/LICENSE b/LICENSE
index 6424d06..18d052b 100644
--- a/LICENSE
+++ b/LICENSE
@@ -175,17 +175,6 @@
 
    END OF TERMS AND CONDITIONS
 
-   APPENDIX: How to apply the Apache License to your work.
-
-      To apply the Apache License to your work, attach the following
-      boilerplate notice, with the fields enclosed by brackets "{}"
-      replaced with your own identifying information. (Don't include
-      the brackets!)  The text should be enclosed in the appropriate
-      comment syntax for the file format. We also recommend that a
-      file or class name and description of purpose be included on the
-      same "printed page" as the copyright notice for easier
-      identification within third-party archives.
-
    Copyright 2018 Prof. Dr. Bernhard Küster, Chair of Proteomics and Bioanalytics, Technical University of Munich
 
    Licensed under the Apache License, Version 2.0 (the "License");

From 1a97b309994a3cc55e576712f37446ba81f0dee3 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Thu, 29 Nov 2018 16:11:50 +0100
Subject: [PATCH 03/18] :sparkles: init

---
 Dockerfile               |  11 ++++
 Makefile                 |  44 +++++++++++++
 README.md                |  62 +++++++++++++++++-
 examples/output_msms.txt |   4 ++
 examples/peptidelist.csv |   4 ++
 prosit/__init__.py       |  12 ++++
 prosit/alignment.py      |  40 ++++++++++++
 prosit/annotate.py       |  43 +++++++++++++
 prosit/constants.py      | 112 ++++++++++++++++++++++++++++++++
 prosit/io.py             |  23 +++++++
 prosit/layers.py         |  96 +++++++++++++++++++++++++++
 prosit/losses.py         |  25 +++++++
 prosit/match.py          | 136 +++++++++++++++++++++++++++++++++++++++
 prosit/maxquant.py       | 119 ++++++++++++++++++++++++++++++++++
 prosit/model.py          |  55 ++++++++++++++++
 prosit/normalize.py      |   8 +++
 prosit/prediction.py     |  45 +++++++++++++
 prosit/sanitize.py       |  92 ++++++++++++++++++++++++++
 prosit/server.py         |  40 ++++++++++++
 prosit/tensorize.py      |  64 ++++++++++++++++++
 prosit/training.py       |  53 +++++++++++++++
 prosit/utils.py          |  39 +++++++++++
 setup.py                 |  14 ++++
 23 files changed, 1140 insertions(+), 1 deletion(-)
 create mode 100644 Dockerfile
 create mode 100644 Makefile
 create mode 100644 examples/output_msms.txt
 create mode 100644 examples/peptidelist.csv
 create mode 100644 prosit/__init__.py
 create mode 100644 prosit/alignment.py
 create mode 100644 prosit/annotate.py
 create mode 100644 prosit/constants.py
 create mode 100644 prosit/io.py
 create mode 100644 prosit/layers.py
 create mode 100644 prosit/losses.py
 create mode 100644 prosit/match.py
 create mode 100644 prosit/maxquant.py
 create mode 100644 prosit/model.py
 create mode 100644 prosit/normalize.py
 create mode 100644 prosit/prediction.py
 create mode 100644 prosit/sanitize.py
 create mode 100644 prosit/server.py
 create mode 100644 prosit/tensorize.py
 create mode 100644 prosit/training.py
 create mode 100644 prosit/utils.py
 create mode 100644 setup.py

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..880278a
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,11 @@
+FROM tensorflow/tensorflow:1.10.1-gpu-py3
+RUN pip install keras==2.2.1 h5py tables flask
+
+ENV KERAS_BACKEND=tensorflow
+ENV TF_CPP_MIN_LOG_LEVEL=3
+ENV LC_ALL=C.UTF-8
+ENV LANG=C.UTF-8
+
+ADD prosit/ /root/prosit
+RUN cd /root/
+WORKDIR /root/
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..5b9ff83
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,44 @@
+DATA ?= $(HOME)/data.hdf5
+MODEL ?= $(HOME)/model/
+OUT_FOLDER ?= $(MODEL)
+HOSTPORT ?= 5000
+GPU ?= 0
+DOCKER = nvidia-docker
+IMAGE = prosit
+DOCKERFILE = Dockerfile
+
+
+build:
+	$(DOCKER) build -qf $(DOCKERFILE) -t $(IMAGE) .
+
+
+predict: build
+	$(DOCKER) run -it \
+	    -v "$(DATA)":/root/data.hdf5 \
+	    -v "$(MODEL)":/root/model/ \
+	    -v "$(OUT_FOLDER)":/root/prediction/ \
+	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
+	    $(IMAGE) python3 -m prosit.prediction
+
+
+train: build
+	$(DOCKER) run -it \
+	    -v "$(DATA)":/root/data.hdf5 \
+	    -v "$(MODEL)":/root/model/ \
+	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
+	    $(IMAGE) python3 -m prosit.training
+
+
+server: build
+	$(DOCKER) run -it \
+	    -v "$(MODEL)":/root/model/ \
+	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
+	    -p $(HOSTPORT):5000 \
+	    $(IMAGE) python3 -m prosit.server
+
+jump: build
+	$(DOCKER) run -it \
+	    -v "$(MODEL)":/root/model/ \
+	    -v "$(DATA)":/root/data.hdf5 \
+	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
+	    $(IMAGE) bash
diff --git a/README.md b/README.md
index b1b5509..fcf4709 100644
--- a/README.md
+++ b/README.md
@@ -1,5 +1,65 @@
+
 # Prosit
 
-ProteomeTools Spectral predIction Tool
+Prosit is a deep neural network to predict iRT values and MS2 spectra for given peptide sequences. 
+You can use it at [proteomicsdb.org/prosit/](https://www.proteomicsdb.org/prosit/) without installation.
 
 [![CLA assistant](https://cla-assistant.io/readme/badge/kusterlab/prosit)](https://cla-assistant.io/kusterlab/prosit)
+
+## Hardware
+
+Prosit requires
+
+- a [GPU with CUDA support](https://developer.nvidia.com/cuda-gpus)
+
+
+## Installation
+
+Prosit requires
+
+- [Docker 17.05.0-ce](https://docs.docker.com/install/)
+- [nvidia-docker 2.0.3](https://github.com/NVIDIA/nvidia-docker) with CUDA 8.0 and CUDNN 6 or later installed
+- [make 4.1](https://www.gnu.org/software/make/)
+
+Prosit was tested on Ubuntu 16.04, CUDA 8.0, CUDNN 6 with Nvidia Tesla K40c and Titan Xp graphic cards with the dependencies above.
+
+Installation time depends on your download speed (Prosit downloads a 3GB Docker image). In our tests, installation takes ~5 minutes.
+
+## Model
+
+Prosit assumes your model to be in a directory that includes:
+
+- model.yml - a saved keras model
+- config.yml - a configuration file specifying the names of the model's inputs and outputs
+- weights file(s) - that follow the template `weight_{epoch}_{loss}.hdf5`
+
+You can download a pre-trained model for HCD fragmentation prediction on [figshare](https://figshare.com/account/home#/projects/35582).
+
+## Usage
+
+The following command will load your model from `/path/to/model/`.
+In the example GPU device 0 is used for computation. The default PORT is 5000.
+
+    make server MODEL=/path/to/model/
+
+## Example
+
+Please find an example input file at `examples/peptidelist.csv`. After starting the server you can run:
+
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/
+
+The example takes about 4s to run. An expected output file can be found at `examples/output_msms.txt`.
+
+## Using Prosit on your data
+
+You can adjust the example above to your own needs. Send any list of (Peptide, Precursor charge, Collision energy) in the format of `examples/peptidelist.csv` to a running instance of the Prosit server.
+
+
+## Pseudo-code
+
+1. Load the model given as MODEL environment variable
+2. Start a server and wait for inputs
+3. On incoming request
+    * transform peptide list to model input format (numpy arrays)
+    * predict fragment intensity with loaded model for given peptides
+    * transform prediction to msms.txt output format and return response
diff --git a/examples/output_msms.txt b/examples/output_msms.txt
new file mode 100644
index 0000000..0d6628d
--- /dev/null
+++ b/examples/output_msms.txt
@@ -0,0 +1,4 @@
+Intensities	Masses	Matches	Modified Sequence	Charge
+1.0;0.412596;0.730622;0.304245;0.116402;0.0937164;0.0809876;0.192307;0.0859734;0.15234;0.0825881;0.0125858;0.0341731;0.0134653;0.0030653;0.00143102;0.00824674;0.00419197;0.000614032;0.00130399;0.0022207;0.000594303	175.118952167;322.154347167;435.238411167;548.322475167;619.359589167;690.396703167;761.433817167;263.088246467;360.141010467;431.178124467;502.215238467;573.252352467;218.122843817;429.746928817;132.047761467;180.574143467;216.092700467;251.611257467;207.124714034;230.803752034;254.482790034;229.450323134	y1;y2;y3;y4;y5;y6;y7;b2;b3;b4;b5;b6;y3(2+);y8(2+);b2(2+);b3(2+);b4(2+);b5(2+);y5(3+);y6(3+);y7(3+);b7(3+)	MMPAAALIM(ox)R	3
+0.0318386;0.0534038;0.000866649;0.165916;0.61992;1.0;0.37558;0.00687848;0.85537;0.186023;0.00952645;0.125391;0.666909;0.0563227;0.000254796;0.00419896	147.112804167;294.148199167;407.232263167;504.285027167;601.337791167;698.390555167;769.427669167;882.511733167;245.131825467;316.168939467;413.221703467;301.172533817;349.698915817;385.217472817;158.588107967;207.114489967	y1;y2;y3;y4;y5;y6;y7;y8;b2;b3;b4;y5(2+);y6(2+);y7(2+);b3(2+);b4(2+)	MLAPPPIM(ox)K	2
+0.647814;0.762372;1.0;0.785805;0.244676;0.0376843;0.985027;0.468406;0.305399;0.572137;0.273087;0.0259239;0.315776;0.694665;0.426082;0.00298714;0.0162049;0.0243926;0.00191334;0.027638;0.00831755;0.00103351	175.118952167;419.207111167;516.259875167;613.312639167;710.365403167;132.047761467;288.148872467;359.185986467;472.270050467;585.354114467;698.438178467;811.522242467;258.633575817;307.159957817;355.686339817;180.096631467;236.638663467;293.180695467;349.722727467;59.0445017003;237.459985367;44.6874381337	y1;y3;y4;y5;y6;b1;b2;b3;b4;b5;b6;b7;y4(2+);y5(2+);y6(2+);b3(2+);b4(2+);b5(2+);b6(2+);y1(3+);y6(3+);b1(3+)	MRALLLIPPPPM(ox)R	6
diff --git a/examples/peptidelist.csv b/examples/peptidelist.csv
new file mode 100644
index 0000000..daec5a1
--- /dev/null
+++ b/examples/peptidelist.csv
@@ -0,0 +1,4 @@
+modified_sequence,collision_energy,precursor_charge
+MMPAAALIM(ox)R,35,3
+MLAPPPIM(ox)K,30,2
+MRALLLIPPPPM(ox)R,30,6
diff --git a/prosit/__init__.py b/prosit/__init__.py
new file mode 100644
index 0000000..042f8f5
--- /dev/null
+++ b/prosit/__init__.py
@@ -0,0 +1,12 @@
+from . import io
+from . import constants
+from . import model
+from . import alignment
+from . import prediction
+from . import training
+from . import server
+from . import layers
+from . import sanitize
+
+
+__version__ = "1.0"
diff --git a/prosit/alignment.py b/prosit/alignment.py
new file mode 100644
index 0000000..c045335
--- /dev/null
+++ b/prosit/alignment.py
@@ -0,0 +1,40 @@
+import numpy
+
+from . import tensorize
+
+ACE_RANGE = list(range(18, 40, 1))
+
+
+def get_alignment_tensor(tensor, subset_size=10000):
+    """Replicate a random subset of high-scoring, non-reverse rows once per CE in ACE_RANGE."""
+    mask_score = (tensor["score"] > 100).reshape(tensor["score"].shape[0])
+    mask_decoy = (tensor["reverse"] == False).reshape(tensor["score"].shape[0])  # elementwise ==
+    tm = {key: data[mask_score & mask_decoy] for key, data in tensor.items()}
+    if tm["score"].shape[0] < subset_size:
+        subset_idx = range(tm["score"].shape[0])  # fewer rows than requested: keep them all
+    else:
+        idx = list(range(tm["intensities_raw"].shape[0]))
+        numpy.random.shuffle(idx)
+        subset_idx = idx[:subset_size]  # honour the parameter (was hard-coded to 10000)
+    alignment_tensors = {}
+    for cea in ACE_RANGE:
+        tmp = {k: d[subset_idx] for k, d in tm.items()}
+        tmp["collision_energy_aligned"] = tmp["collision_energy"] * 0 + cea
+        tmp["collision_energy_aligned_normed"] = tmp["collision_energy_aligned"] / 100.
+        alignment_tensors[cea] = tmp
+    return tensorize.stack(alignment_tensors)
+
+
+def get_ace_dist(tensor):
+    """Map each candidate CE in ACE_RANGE to the median spectral angle of its rows."""
+    medians = {}
+    for ace in ACE_RANGE:
+        selected = tensor["collision_energy_aligned_normed"] == ace / 100.
+        selected = selected.reshape(selected.shape[0])  # flatten to a 1-D row mask
+        medians[int(ace)] = numpy.median(tensor["spectral_angle"][selected])
+    return medians
+
+
+def get_ace(tensor):
+    """Return the CE from get_ace_dist whose median spectral angle is largest."""
+    return max(get_ace_dist(tensor).items(), key=lambda kv: kv[1])[0]
diff --git a/prosit/annotate.py b/prosit/annotate.py
new file mode 100644
index 0000000..2fdd3c8
--- /dev/null
+++ b/prosit/annotate.py
@@ -0,0 +1,43 @@
+import numpy
+import collections
+from .constants import AMINO_ACID, PROTON, ION_OFFSET, FORWARD, BACKWARD
+from . import constants
+
+
+def adjust_masses(method):  # shifts residue masses in the shared AMINO_ACID table
+    if method == "SILAC":
+        offsets = {"K": 8.01419881319, "R": 10.008268599}
+    else:
+        raise ValueError("Don't know method: " + method)
+    # In-place and cumulative: calling this twice applies the offsets twice.
+    for aa, offset in offsets.items():
+        AMINO_ACID[aa] += offset
+
+
+def get_mz(sum_, ion_offset, charge):  # (mass sum + ion offset + charge protons) / charge
+    return (sum_ + ion_offset + charge * PROTON) / charge
+
+
+def get_mzs(cumsum, ion_type, z):  # one m/z per cumulative sum; the last (full-length) sum is skipped
+    return [get_mz(s, ION_OFFSET[ion_type], z) for s in cumsum[:-1]]
+
+
+def get_annotation(forward, backward, charge, ion_types):  # label -> m/z for one charge state
+    tmp = "{}{}"  # plain label: ion type + 1-based position
+    tmp_nl = "{}{}-{}"  # neutral-loss label: ion type + position + "-" + loss name
+    all_ = {}
+    for ion_type in ion_types:  # any iterable of one-letter types works, including a str
+        if ion_type in constants.FORWARD:
+            cummass = forward
+        elif ion_type in constants.BACKWARD:
+            cummass = backward
+        else:
+            raise ValueError("unkown ion_type: {}".format(ion_type))
+        masses = get_mzs(cummass, ion_type, charge)
+        d = {tmp.format(ion_type, i + 1): m for i, m in enumerate(masses)}
+        all_.update(d)
+        for nl, offset in constants.NEUTRAL_LOSS.items():
+            nl_masses = get_mzs(cummass - offset, ion_type, charge)  # assumes cummass is a numpy array
+            d = {tmp_nl.format(ion_type, i + 1, nl): m for i, m in enumerate(nl_masses)}
+            all_.update(d)
+    return collections.OrderedDict(sorted(all_.items(), key=lambda t: t[0]))  # sorted by label string
diff --git a/prosit/constants.py b/prosit/constants.py
new file mode 100644
index 0000000..353450b
--- /dev/null
+++ b/prosit/constants.py
@@ -0,0 +1,112 @@
+DATA_PATH = "/root/data.hdf5"
+MODEL_DIR = "/root/model/"
+OUT_DIR = "/root/prediction/"
+
+VAL_SPLIT = 0.8
+
+TRAIN_EPOCHS = 500
+TRAIN_BATCH_SIZE = 1024
+PRED_BATCH_SIZE = 1024
+PRED_BAYES = False
+PRED_N = 100
+
+TOLERANCE_FTMS = 25
+TOLERANCE_ITMS = 0.35
+TOLERANCE_TRIPLETOF = 0.5
+
+TOLERANCE = {"FTMS": (25, "ppm"), "ITMS": (0.35, "da"), "TripleTOF": (50, "ppm")}
+
+
+ALPHABET = {
+    "A": 1,
+    "C": 2,
+    "D": 3,
+    "E": 4,
+    "F": 5,
+    "G": 6,
+    "H": 7,
+    "I": 8,
+    "K": 9,
+    "L": 10,
+    "M": 11,
+    "N": 12,
+    "P": 13,
+    "Q": 14,
+    "R": 15,
+    "S": 16,
+    "T": 17,
+    "V": 18,
+    "W": 19,
+    "Y": 20,
+    "M(ox)": 21,
+}
+ALPHABET_S = {integer: char for char, integer in ALPHABET.items()}
+
+CHARGES = [1, 2, 3, 4, 5, 6]
+DEFAULT_MAX_CHARGE = len(CHARGES)
+MAX_FRAG_CHARGE = 3
+MAX_SEQUENCE = 30
+MAX_ION = MAX_SEQUENCE - 1
+ION_TYPES = ["y", "b"]
+NLOSSES = ["", "H2O", "NH3"]
+
+FORWARD = {"a", "b", "c"}
+BACKWARD = {"x", "y", "z"}
+
+# Amino acids
+MODIFICATION = {
+    "CAM": 57.0214637236,  # Carbamidomethylation (CAM)
+    "OX": 15.99491,  # Oxidation
+}
+AMINO_ACID = {
+    "G": 57.021464,
+    "R": 156.101111,
+    "V": 99.068414,
+    "P": 97.052764,
+    "S": 87.032028,
+    "U": 150.95363,
+    "L": 113.084064,
+    "M": 131.040485,
+    "Q": 128.058578,
+    "N": 114.042927,
+    "Y": 163.063329,
+    "E": 129.042593,
+    "C": 103.009185 + MODIFICATION["CAM"],
+    "F": 147.068414,
+    "I": 113.084064,
+    "A": 71.037114,
+    "T": 101.047679,
+    "W": 186.079313,
+    "H": 137.058912,
+    "D": 115.026943,
+    "K": 128.094963,
+}
+AMINO_ACID["M(ox)"] = AMINO_ACID["M"] + MODIFICATION["OX"]
+
+# Atomic elements
+PROTON = 1.007276467
+ELECTRON = 0.00054858
+H = 1.007825035
+C = 12.0
+O = 15.99491463
+N = 14.003074
+
+# Tiny molecules
+N_TERMINUS = H
+C_TERMINUS = O + H
+CO = C + O
+CHO = C + H + O
+NH2 = N + H * 2
+H2O = H * 2 + O
+NH3 = N + H * 3
+
+NEUTRAL_LOSS = {"NH3": NH3, "H2O": H2O}
+
+ION_OFFSET = {
+    "a": N_TERMINUS - CHO,
+    "b": N_TERMINUS - H,
+    "c": N_TERMINUS + NH2,
+    "x": C_TERMINUS + CO - H,
+    "y": C_TERMINUS + H,
+    "z": C_TERMINUS - NH2,
+}
diff --git a/prosit/io.py b/prosit/io.py
new file mode 100644
index 0000000..ca20adc
--- /dev/null
+++ b/prosit/io.py
@@ -0,0 +1,23 @@
+
+from . import utils
+
+
+def get_array(tensor, keys):  # values of tensor for keys, in the order given
+    utils.check_mandatory_keys(tensor, keys)  # defined in utils; presumably rejects missing keys
+    return [tensor[key] for key in keys]
+
+
+def to_hdf5(dictionary, path):  # write each item as a gzip-compressed dataset
+    import h5py
+    # Mode "w" creates the file and truncates an existing one; values need a .dtype attribute.
+    with h5py.File(path, "w") as f:
+        for key, data in dictionary.items():
+            f.create_dataset(key, data=data, dtype=data.dtype, compression="gzip")
+
+
+def from_hdf5(path):  # read every top-level entry of the file into a dict
+    import h5py
+    # f[k][...] reads the data out, so the dict stays usable after the file is closed.
+    with h5py.File(path, "r") as f:
+        data = {k: f[k][...] for k in f.keys()}
+    return data
diff --git a/prosit/layers.py b/prosit/layers.py
new file mode 100644
index 0000000..660fc1d
--- /dev/null
+++ b/prosit/layers.py
@@ -0,0 +1,96 @@
+from keras import backend as K
+from keras import regularizers, constraints, initializers, activations
+from keras.engine.topology import Layer
+
+
+class Attention(Layer):
+    """Keras layer that reduces a 3-D input over axis 1 using learned, normalised weights."""
+    def __init__(
+        self,
+        context=False,
+        W_regularizer=None,
+        b_regularizer=None,
+        u_regularizer=None,
+        W_constraint=None,
+        b_constraint=None,
+        u_constraint=None,
+        bias=True,
+        **kwargs
+    ):
+        # Attributes are assigned first; the base-class constructor runs last.
+        self.supports_masking = True
+        self.init = initializers.get("glorot_uniform")
+        self.W_regularizer = regularizers.get(W_regularizer)
+        self.b_regularizer = regularizers.get(b_regularizer)
+        self.u_regularizer = regularizers.get(u_regularizer)
+        self.W_constraint = constraints.get(W_constraint)
+        self.b_constraint = constraints.get(b_constraint)
+        self.u_constraint = constraints.get(u_constraint)
+        self.bias = bias
+        self.context = context
+        super(Attention, self).__init__(**kwargs)
+
+    def build(self, input_shape):
+        assert len(input_shape) == 3  # only rank-3 inputs are supported
+        self.W = self.add_weight(
+            (input_shape[-1],),
+            initializer=self.init,
+            name="{}_W".format(self.name),
+            regularizer=self.W_regularizer,
+            constraint=self.W_constraint,
+        )
+        if self.bias:
+            self.b = self.add_weight(
+                (input_shape[1],),
+                initializer="zero",
+                name="{}_b".format(self.name),
+                regularizer=self.b_regularizer,
+                constraint=self.b_constraint,
+            )
+        else:
+            self.b = None
+        if self.context:
+            self.u = self.add_weight(
+                (input_shape[-1],),
+                initializer=self.init,
+                name="{}_u".format(self.name),
+                regularizer=self.u_regularizer,
+                constraint=self.u_constraint,
+            )
+
+        self.built = True
+
+    def compute_mask(self, input, input_mask=None):
+        return None  # no mask is handed on to later layers
+
+    def call(self, x, mask=None):
+        a = K.squeeze(K.dot(x, K.expand_dims(self.W)), axis=-1)  # one score per axis-1 position
+        if self.bias:
+            a += self.b
+        a = K.tanh(a)
+        if self.context:
+            a = K.squeeze(K.dot(x, K.expand_dims(self.u)), axis=-1)  # NOTE(review): replaces the tanh scores above - confirm intended
+        a = K.exp(a)
+        if mask is not None:
+            a *= K.cast(mask, K.floatx())  # masked positions get weight 0
+        a /= K.cast(K.sum(a, axis=1, keepdims=True) + K.epsilon(), K.floatx())  # normalise over axis 1
+        a = K.expand_dims(a)
+        weighted_input = x * a
+        return K.sum(weighted_input, axis=1)  # weighted sum over axis 1
+
+    def compute_output_shape(self, input_shape):
+        return input_shape[0], input_shape[-1]  # axis 1 is reduced away
+
+    def get_config(self):
+        config = {
+            'bias': self.bias,
+            'context': self.context,
+            'W_regularizer': regularizers.serialize(self.W_regularizer),
+            'b_regularizer': regularizers.serialize(self.b_regularizer),
+            'u_regularizer': regularizers.serialize(self.u_regularizer),
+            'W_constraint': constraints.serialize(self.W_constraint),
+            'b_constraint': constraints.serialize(self.b_constraint),
+            'u_constraint': constraints.serialize(self.u_constraint),
+        }
+        base_config = super(Attention, self).get_config()
+        return dict(list(base_config.items()) + list(config.items()))  # layer keys win on clashes
diff --git a/prosit/losses.py b/prosit/losses.py
new file mode 100644
index 0000000..0678ff3
--- /dev/null
+++ b/prosit/losses.py
@@ -0,0 +1,25 @@
+import numpy
+
+
+def masked_spectral_distance(true, pred):
+    """Return 2*acos(cosine)/pi of the two spectra; entries where true == -1 are zeroed."""
+    import tensorflow
+    import keras.backend as k
+
+    epsilon = k.epsilon()
+    masked_pred = ((true + 1) * pred) / (true + 1 + epsilon)
+    masked_true = ((true + 1) * true) / (true + 1 + epsilon)
+    unit_true = k.l2_normalize(masked_true, axis=-1)
+    unit_pred = k.l2_normalize(masked_pred, axis=-1)
+    cosine = k.sum(unit_true * unit_pred, axis=1)
+    return 2 * tensorflow.acos(cosine) / numpy.pi
+
+
+losses = {"masked_spectral_distance": masked_spectral_distance}
+
+
+def get(loss_name):
+    """Look up loss_name in `losses`.
+
+    Unknown names are returned unchanged."""
+    return losses.get(loss_name, loss_name)
diff --git a/prosit/match.py b/prosit/match.py
new file mode 100644
index 0000000..6c94f9b
--- /dev/null
+++ b/prosit/match.py
@@ -0,0 +1,136 @@
+import numpy
+
+from . import annotate
+from . import constants
+
+
+def read_attribute(row, attribute):  # parse a space-separated list of floats from row[attribute]
+    if " " not in str(row[attribute]):  # NOTE: any value without a space (even one number) gives []
+        return []
+    else:
+        return [float(m) for m in row[attribute].split(" ")]
+
+
+def peptide_parser(p):
+    """Yield the residues of p in order; a residue followed by "(...)" is one token."""
+    if p.startswith("("):  # also safe for "", where p[0] used to raise IndexError
+        raise ValueError("sequence starts with '('")
+    n = len(p)
+    i = 0
+    while i < n:
+        if i < n - 3 and p[i + 1] == "(":
+            offset = p.index(")", i + 2) + 1  # one past the closing bracket, no slice copy
+            yield p[i:offset]
+            i = offset
+        else:
+            yield p[i]
+            i += 1
+
+
+def get_forward_backward(peptide):
+    """Return cumulative residue masses counted from the first residue and from the last."""
+    residue_masses = [constants.AMINO_ACID[aa] for aa in peptide_parser(peptide)]
+    from_start = numpy.cumsum(residue_masses)
+    from_end = numpy.cumsum(residue_masses[::-1])
+    return from_start, from_end
+
+
+def get_tolerance(theoretical, mass_analyzer):
+    """Absolute tolerance for mass_analyzer; "ppm" entries scale with theoretical."""
+    if mass_analyzer not in constants.TOLERANCE:
+        raise ValueError("no tolerance implemented for {}".format(mass_analyzer))
+    tolerance, unit = constants.TOLERANCE[mass_analyzer]
+    if unit == "ppm":
+        # parts per million of the theoretical value
+        return theoretical * float(tolerance) / 10 ** 6
+    if unit == "da":
+        return float(tolerance)
+    raise ValueError("unit {} not implemented".format(unit))
+
+
+def is_in_tolerance(theoretical, observed, mass_analyzer):
+    """True if theoretical lies within the analyzer tolerance window around observed."""
+    window = get_tolerance(theoretical, mass_analyzer)
+    # The window is centred on the observed value and sized from the theoretical one.
+    return observed - window <= theoretical <= observed + window
+
+
+def binarysearch(masses_raw, theoretical, mass_analyzer):  # masses_raw must be sorted ascending
+    lo, hi = 0, len(masses_raw) - 1
+    while lo <= hi:
+        mid = (lo + hi) // 2
+        if is_in_tolerance(theoretical, masses_raw[mid], mass_analyzer):
+            return mid  # index of a peak inside the window, not necessarily the closest one
+        elif masses_raw[mid] < theoretical:
+            lo = mid + 1
+        else:  # catch-all: a NaN peak used to match neither elif and loop forever
+            hi = mid - 1
+    return None  # no observed peak within tolerance
+
+
+def match(row, ion_types, max_charge=constants.DEFAULT_MAX_CHARGE):  # one result dict per fragment charge
+    masses_observed = read_attribute(row, "masses_raw")
+    intensities_observed = read_attribute(row, "intensities_raw")
+    forward_sum, backward_sum = get_forward_backward(row.modified_sequence[1:-1])  # first and last character dropped
+    _max_charge = row.charge if row.charge <= max_charge else max_charge  # min(row.charge, max_charge)
+    matches = []
+    for charge_index in range(_max_charge):
+        d = {
+            "masses_raw": [],
+            "masses_theoretical": [],
+            "intensities_raw": [],
+            "matches": [],
+        }
+        charge = charge_index + 1  # list position 0 holds charge 1
+        annotations = annotate.get_annotation(
+            forward_sum, backward_sum, charge, ion_types
+        )
+        for annotation, mass_t in annotations.items():
+            index = binarysearch(masses_observed, mass_t, row.mass_analyzer)  # presumes sorted observed masses
+            if index is not None:
+                d["masses_raw"].append(masses_observed[index])
+                d["intensities_raw"].append(intensities_observed[index])
+                d["masses_theoretical"].append(mass_t)
+                d["matches"].append(annotation)
+        matches.append(d)
+    return matches
+
+
+def c_lambda(matches, charge, attr):
+    """Build a function mapping a row key to the ";"-joined attr values for one charge."""
+    def mapping(i):
+        charge_index = int(charge - 1)
+        m = matches[i]
+        if charge_index < len(m):
+            try:
+                s = ";".join(map(str, m[charge_index][attr]))
+            except Exception as err:  # was a bare except, which also trapped KeyboardInterrupt
+                raise ValueError(m[charge_index][attr]) from err
+        else:
+            s = ""  # this row has no entry for the requested charge
+        return s
+
+    return mapping
+
+
+def augment(df, ion_types, charge_max):  # adds per-charge match columns to df and returns it
+    matches = {}
+    for i, row in df.iterrows():
+        matches[i] = match(row, ion_types, charge_max)  # keyed by index label; duplicates overwrite
+
+    # add four ";"-joined string columns per charge state (nothing is written to disk here)
+    for charge in range(1, charge_max + 1):
+        df["matches_charge{}".format(charge)] = df.index.map(
+            c_lambda(matches, charge, "matches")
+        )
+        df["masses_the_charge{}".format(charge)] = df.index.map(
+            c_lambda(matches, charge, "masses_theoretical")
+        )
+        df["masses_raw_charge{}".format(charge)] = df.index.map(
+            c_lambda(matches, charge, "masses_raw")
+        )
+        df["intensities_raw_charge{}".format(charge)] = df.index.map(
+            c_lambda(matches, charge, "intensities_raw")
+        )
+
+    return df
diff --git a/prosit/maxquant.py b/prosit/maxquant.py
new file mode 100644
index 0000000..50124ce
--- /dev/null
+++ b/prosit/maxquant.py
@@ -0,0 +1,119 @@
+import pandas
+
+import os
+
+from . import constants
+from . import match
+from . import annotate
+from . import utils
+
+COL_SEP = "\t"
+
+
+def rename_column(attribute):
+    """Lower-case attribute, turn spaces into underscores and drop ".[]()/" characters."""
+    strip_table = str.maketrans("", "", ".[]()/")
+    return attribute.lower().replace(" ", "_").translate(strip_table)
+
+
+def read(filepath, low_memory=False):  # load selected columns of a tab-separated table into a DataFrame
+    COL_SEP = "\t"  # local name shadows the module-level COL_SEP (same value)
+    CONVERTERS = {"Reverse": lambda r: True if r == "+" else False}
+    TYPES = {
+        "Type": "object",
+        "Masses": "object",
+        "Intensities": "object",
+        "Matches": "object",
+        "Sequence": "object",
+        "Modifications": "object",
+        "Modified sequence": "object",
+        "Raw file": "object",
+        "Score": float,
+        "Precursor Intensity": float,
+        "Mass": float,
+        "Mass Error [ppm]": float,
+        "Delta score": float,
+        "Peptide ID": int,
+        "Scan event number": int,
+        "Scan number": int,
+        "Charge": int,
+        "Reverse": bool,  # also listed in CONVERTERS; presumably the converter takes precedence - confirm
+    }
+
+    # fix different maxquant formats
+    header = pandas.read_csv(filepath, nrows=1, sep=COL_SEP).columns
+    if "Mass Error [ppm]" not in header and "Mass error [ppm]" in header:
+        TYPES["Mass error [ppm]"] = TYPES.pop("Mass Error [ppm]")  # accept the lower-case spelling
+
+    df = pandas.read_csv(
+        filepath,
+        header="infer",
+        sep=COL_SEP,
+        usecols=TYPES.keys(),
+        dtype=TYPES,
+        converters=CONVERTERS,
+        low_memory=low_memory,
+    )
+    df.columns = list(map(rename_column, df.columns))
+    df = df[df.type.map(lambda x: x != "MULTI-SECPEP")]  # drop rows typed MULTI-SECPEP
+    df = df.set_index("scan_number", drop=False)  # index by scan number, keeping the column too
+    return df
+
+
+def write(df, filepath):  # tab-separated output without the index column
+    df.to_csv(filepath, sep=COL_SEP, index=False)
+
+
+def convert_prediction(tensor):  # build a DataFrame of ";"-joined ions, m/z values and intensities
+    assert "intensities_pred" in tensor
+    assert "sequence_integer" in tensor
+    assert "precursor_charge_onehot" in tensor
+    intensities_pred = utils.reshape_dims(tensor["intensities_pred"])  # indexed with five axes below
+    modified_sequences = utils.sequence_integer_to_str(tensor["sequence_integer"])
+    natural_losses_max = 0  # only index 0 of the loss axis is exported (presumably "no loss" - confirm)
+
+    def convert_row(i):
+        modified_sequence = modified_sequences[i]
+        fw, bw = match.get_forward_backward(modified_sequence)
+        mzs = []
+        ions = []
+        intes = []
+        for fz in range(constants.MAX_FRAG_CHARGE):  # fz is zero-based; the charge is fz + 1
+            ann = annotate.get_annotation(fw, bw, fz + 1, "yb")  # the str is iterated as "y", "b"
+            for fty_i, fty in enumerate(constants.ION_TYPES):
+                for fi in range(constants.MAX_ION):
+                    ion = fty + str(fi + 1)
+                    inte = intensities_pred[i, fi, fty_i, natural_losses_max, fz]
+                    if inte > 0:  # only strictly positive intensities are kept
+                        mz = ann[ion]
+                        if fz > 0:
+                            ion += "({}+)".format(fz + 1)  # charge suffix for charges above 1
+                        mzs.append(mz)
+                        ions.append(ion)
+                        intes.append(inte)
+                    else:
+                        continue
+
+        mzs_s = ";".join(map(str, mzs))
+        matches_s = ";".join(ions)
+        ints_s = ";".join(map(str, intes))
+        return mzs_s, matches_s, ints_s
+
+    masses_c = []
+    matches_c = []
+    ints_c = []
+    for i in range(len(modified_sequences)):
+        mzs, matches, ints = convert_row(i)
+        masses_c.append(mzs)
+        matches_c.append(matches)
+        ints_c.append(ints)
+    df = pandas.DataFrame(
+        {
+            "Matches": matches_c,
+            "Masses": masses_c,
+            "Intensities": ints_c,
+            "Modified Sequence": modified_sequences,
+        }
+    )
+    df["Charge"] = tensor["precursor_charge_onehot"].argmax(1) + 1  # one-hot position 0 means charge 1
+    return df
diff --git a/prosit/model.py b/prosit/model.py
new file mode 100644
index 0000000..4fd1d38
--- /dev/null
+++ b/prosit/model.py
@@ -0,0 +1,55 @@
+import os
+import yaml
+
+from . import constants
+from . import layers
+from . import utils
+
+
+MODEL_NAME = "model.yml"
+CONFIG_NAME = "config.yml"
+
+
+def is_weight_name(w):  # True for names of the form "weight_*.hdf5"
+    return w.startswith("weight_") and w.endswith(".hdf5")
+
+
+def get_loss(x):  # "..._<loss>.hdf5" -> float(<loss>): last "_" field minus the 5-char suffix
+    return float(x.rsplit("_", 1)[-1][: -len(".hdf5")])
+
+
+def get_best_weights_path(model_dir):
+    """Return the path of the lowest-loss weight file in model_dir, or None if there is none."""
+    weights = list(filter(is_weight_name, os.listdir(model_dir)))
+    if not weights:
+        return None
+    d = {get_loss(w): w for w in weights}  # keyed by the loss parsed from each file name
+    # os.path.join matches load() and avoids the "//" that "{}/{}" gave for "dir/" inputs
+    return os.path.join(model_dir, d[min(d)])
+
+
+def load(model_dir, trained=False):
+    """Return (model, config) read from model_dir; best weights are loaded when trained=True."""
+    import keras
+
+    weights_path = get_best_weights_path(model_dir)
+    with open(os.path.join(model_dir, CONFIG_NAME), "r") as f:
+        # safe_load: bare yaml.load(f) can build arbitrary objects and needs Loader= on PyYAML >= 6
+        config = yaml.safe_load(f)
+    with open(os.path.join(model_dir, MODEL_NAME), "r") as f:
+        model = keras.models.model_from_yaml(
+            f.read(), custom_objects={"Attention": layers.Attention}
+        )
+    if trained and weights_path is not None:
+        model.load_weights(weights_path)
+    return model, config
+
+
+def save(model, config, model_dir):
+    """Write config.yml and model.yml into model_dir, where load() looks for them."""
+    # str.format on the placeholder-free file names silently ignored model_dir (files hit the CWD)
+    utils.check_mandatory_keys(config, ["name", "optimizer", "loss", "x", "y"])
+    with open(os.path.join(model_dir, CONFIG_NAME), "w") as f:
+        yaml.dump(config, f, default_flow_style=False)
+    with open(os.path.join(model_dir, MODEL_NAME), "w") as f:
+        f.write(model.to_yaml())
diff --git a/prosit/normalize.py b/prosit/normalize.py
new file mode 100644
index 0000000..a5a1e84
--- /dev/null
+++ b/prosit/normalize.py
@@ -0,0 +1,8 @@
+import numpy
+
+
+def base_peak(spectral):
+    max_int = spectral.max(1)
+    spectral = spectral / max_int[:, numpy.newaxis]
+    spectral = numpy.nan_to_num(spectral)
+    return spectral
diff --git a/prosit/prediction.py b/prosit/prediction.py
new file mode 100644
index 0000000..a07671c
--- /dev/null
+++ b/prosit/prediction.py
@@ -0,0 +1,45 @@
+import os
+
+from . import model as model_lib
+from . import io
+from . import constants
+from . import sanitize
+
+
+def predict(tensor, model, model_config, verbose=False):
+    import keras
+    # check for mandatory keys
+    x = io.get_array(tensor, model_config["x"])
+
+    model.compile(optimizer="adam", loss="mse")
+    prediction = model.predict(
+        x, verbose=verbose, batch_size=constants.PRED_BATCH_SIZE
+    )
+    if model_config["prediction_type"] == "intensity":
+        tensor["intensities_pred"] = prediction
+        tensor = sanitize.prediction(tensor)
+    elif model_config["prediction_type"] == "iRT":
+        import numpy as np
+        tensor["iRT"] = prediction * np.sqrt(float(model_config["iRT_rescaling_var"])) + float(model_config["iRT_rescaling_mean"])
+    else:
+        raise ValueError("model_config misses parameter")
+
+    return tensor
+
+
+if __name__ == "__main__":
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # turn off tf logging
+    data_path = constants.DATA_PATH
+    model_dir = constants.MODEL_DIR
+
+    weights_path = model_lib.get_best_weights_path(model_dir)
+    weights_name = weights_path.split("/")[-1][:-5]
+    data_name = data_path.split("/")[-1][:-5]
+    model, model_config = model_lib.load(model_dir, trained=True)
+    tensor = io.from_hdf5(data_path)
+    # tensor here is a dictionary
+    tensor = predict(tensor, model, model_config, verbose=True)
+
+    path = os.path.join(constants.OUT_DIR, "prediction.hdf5")
+    # tensor here is a dictionary
+    io.to_hdf5(tensor, path)
diff --git a/prosit/sanitize.py b/prosit/sanitize.py
new file mode 100644
index 0000000..567be8d
--- /dev/null
+++ b/prosit/sanitize.py
@@ -0,0 +1,92 @@
+import numpy
+import functools
+from .constants import *
+from . import losses
+
+
+def reshape_dims(array):
+    n, dims = array.shape
+    assert dims == 174
+    nlosses = 1
+    return array.reshape(
+        [array.shape[0], MAX_SEQUENCE - 1, len(ION_TYPES), nlosses, MAX_FRAG_CHARGE]
+    )
+
+
+def reshape_flat(array):
+    s = array.shape
+    flat_dim = [s[0], functools.reduce(lambda x, y: x * y, s[1:], 1)]
+    return array.reshape(flat_dim)
+
+
+def normalize_base_peak(array):
+    # flat
+    maxima = array.max(axis=1)
+    array = array / maxima[:, numpy.newaxis]
+    return array
+
+
+def mask_outofrange(array, lengths, mask=-1.):
+    # dim
+    for i in range(array.shape[0]):
+        array[i, lengths[i] - 1 :, :, :, :] = mask
+    return array
+
+
+def mask_outofcharge(array, charges, mask=-1.):
+    # dim
+    for i in range(array.shape[0]):
+        if charges[i] < 3:
+            array[i, :, :, :, charges[i] :] = mask
+    return array
+
+
+def get_spectral_angle(true, pred, batch_size=600):
+    import tensorflow
+
+    n = true.shape[0]
+    sa = numpy.zeros([n])
+
+    def iterate():
+        if n > batch_size:
+            for i in range(n // batch_size):
+                true_sample = true[i * batch_size : (i + 1) * batch_size]
+                pred_sample = pred[i * batch_size : (i + 1) * batch_size]
+                yield i, true_sample, pred_sample
+            i = n // batch_size
+            yield i, true[(i) * batch_size :], pred[(i) * batch_size :]
+        else:
+            yield 0, true, pred
+
+    for i, t_b, p_b in iterate():
+        tensorflow.reset_default_graph()
+        with tensorflow.Session() as s:
+            sa_graph = losses.masked_spectral_distance(t_b, p_b)
+            sa_b = 1 - s.run(sa_graph)
+            sa[i * batch_size : i * batch_size + sa_b.shape[0]] = sa_b
+    sa = numpy.nan_to_num(sa)
+    return sa
+
+
+def prediction(data, batch_size=600):
+    assert "sequence_integer" in data
+    assert "intensities_pred" in data
+    assert "precursor_charge_onehot" in data
+
+    sequence_lengths = numpy.count_nonzero(data["sequence_integer"], axis=1)
+    intensities = data["intensities_pred"]
+    charges = list(data["precursor_charge_onehot"].argmax(axis=1) + 1)
+
+    intensities[intensities < 0] = 0
+    intensities = normalize_base_peak(intensities)
+    intensities = reshape_dims(intensities)
+    intensities = mask_outofrange(intensities, sequence_lengths)
+    intensities = mask_outofcharge(intensities, charges)
+    intensities = reshape_flat(intensities)
+    data["intensities_pred"] = intensities
+
+    if "intensities_raw" in data:
+        data["spectral_angle"] = get_spectral_angle(
+            data["intensities_raw"], data["intensities_pred"], batch_size=batch_size
+        )
+    return data
diff --git a/prosit/server.py b/prosit/server.py
new file mode 100644
index 0000000..4799c3f
--- /dev/null
+++ b/prosit/server.py
@@ -0,0 +1,40 @@
+import os
+import numpy
+import flask
+import pandas
+
+
+from . import model as model_lib
+from . import io
+from . import constants
+from . import tensorize
+from . import prediction
+from . import alignment
+from . import maxquant
+
+
+app = flask.Flask(__name__)
+
+
+@app.route("/")
+def hello():
+    return "prosit!\n"
+
+
+@app.route("/predict/", methods=["POST"])
+def predict():
+    df = pandas.read_csv(flask.request.files["peptides"])
+    tensor = tensorize.peptidelist(df)
+    result = prediction.predict(tensor, model, model_config)
+    df_pred = maxquant.convert_prediction(result)
+    path = "{}prediction.csv".format(model_dir)
+    maxquant.write(df_pred, path)
+    return flask.send_file(path)
+
+
+if __name__ == "__main__":
+    model_dir = constants.MODEL_DIR
+    global model
+    global model_config
+    model, model_config = model_lib.load(model_dir, trained=True)
+    app.run(host="0.0.0.0")
diff --git a/prosit/tensorize.py b/prosit/tensorize.py
new file mode 100644
index 0000000..cebc034
--- /dev/null
+++ b/prosit/tensorize.py
@@ -0,0 +1,64 @@
+import collections
+import numpy
+
+from . import constants
+from . import utils
+
+
+# helpers
+
+
+def stack(queue):
+    listed = collections.defaultdict(list)
+    for t in queue.values():
+        if t is not None:
+            for k, d in t.items():
+                listed[k].append(d)
+    stacked = {}
+    for k, d in listed.items():
+        if isinstance(d[0], list):
+            stacked[k] = [item for sublist in d for item in sublist]
+        else:
+            stacked[k] = numpy.vstack(d)
+    return stacked
+
+
+def get_numbers(vals, dtype=float):
+    a = numpy.array(vals).astype(dtype)
+    return a.reshape([len(vals), 1])
+
+
+def get_precursor_charge_onehot(charges):
+    array = numpy.zeros([len(charges), max(constants.CHARGES)], dtype=int)
+    for i, precursor_charge in enumerate(charges):
+        array[i, precursor_charge - 1] = 1
+    return array
+
+
+def get_sequence_integer(sequences):
+    array = numpy.zeros([len(sequences), constants.MAX_SEQUENCE], dtype=int)
+    for i, sequence in enumerate(sequences):
+        for j, s in enumerate(utils.peptide_parser(sequence)):
+            array[i, j] = constants.ALPHABET[s]
+    return array
+
+
+# file types
+
+
+def peptidelist(df):
+    df.reset_index(drop=True, inplace=True)
+    assert "modified_sequence" in df.columns
+    assert "collision_energy" in df.columns
+    assert "precursor_charge" in df.columns
+    tensor = {
+        "collision_energy_aligned_normed": get_numbers(df.collision_energy) / 100.,
+        "sequence_integer": get_sequence_integer(df.modified_sequence),
+        "precursor_charge_onehot": get_precursor_charge_onehot(df.precursor_charge),
+    }
+    return tensor
+
+
+def msms_txt(df):
+    # TODO: implement
+    pass
diff --git a/prosit/training.py b/prosit/training.py
new file mode 100644
index 0000000..c70d55c
--- /dev/null
+++ b/prosit/training.py
@@ -0,0 +1,53 @@
+import os
+
+from . import io
+from . import losses
+from . import model as model_lib
+from . import constants
+
+
+def get_callbacks(model_dir_path):
+    import keras
+
+    loss_format = "{val_loss:.5f}"
+    epoch_format = "{epoch:02d}"
+    weights_file = "{}/weight_{}_{}.hdf5".format(
+        model_dir_path, epoch_format, loss_format
+    )
+    save = keras.callbacks.ModelCheckpoint(weights_file, save_best_only=True)
+    stop = keras.callbacks.EarlyStopping(patience=10)
+    decay = keras.callbacks.ReduceLROnPlateau(patience=2, factor=0.2)
+    return [save, stop, decay]
+
+
+def train(tensor, model, model_config, callbacks):
+    import keras
+
+    if isinstance(model_config["loss"], list):
+        loss = [losses.get(l) for l in model_config["loss"]]
+    else:
+        loss = losses.get(model_config["loss"])
+    optimizer = model_config["optimizer"]
+    x = io.get_array(tensor, model_config["x"])
+    y = io.get_array(tensor, model_config["y"])
+    model.compile(optimizer=optimizer, loss=loss)
+    model.fit(
+        x=x,
+        y=y,
+        epochs=constants.TRAIN_EPOCHS,
+        batch_size=constants.TRAIN_BATCH_SIZE,
+        validation_split=1 - constants.VAL_SPLIT,
+        callbacks=callbacks,
+    )
+    keras.backend.get_session().close()
+
+
+if __name__ == "__main__":
+    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # turn off tf logging
+    data_path = constants.DATA_PATH
+    model_dir = constants.MODEL_DIR
+
+    model, model_config = model_lib.load(model_dir, trained=True)
+    tensor = io.from_hdf5(data_path)
+    callbacks = get_callbacks(model_dir)
+    train(tensor, model, model_config, callbacks)
diff --git a/prosit/utils.py b/prosit/utils.py
new file mode 100644
index 0000000..d1dde97
--- /dev/null
+++ b/prosit/utils.py
@@ -0,0 +1,39 @@
+from .constants import MAX_ION, ION_TYPES, ALPHABET_S
+
+
+def check_mandatory_keys(dictionary, keys):
+    for key in keys:
+        if key not in dictionary.keys():
+            raise KeyError("key {} is missing".format(key))
+    return True
+
+
+def reshape_dims(array, nlosses=1, z=3):
+    return array.reshape([array.shape[0], MAX_ION, len(ION_TYPES), nlosses, z])
+
+
+def get_sequence(sequence):
+    d = ALPHABET_S
+    return "".join([d[i] if i in d else "" for i in sequence])
+
+
+def sequence_integer_to_str(array):
+    sequences = [get_sequence(array[i]) for i in range(array.shape[0])]
+    return sequences
+
+
+def peptide_parser(p):
+    p = p.replace("_", "")
+    if p[0] == "(":
+        raise ValueError("sequence starts with '('")
+    n = len(p)
+    i = 0
+    while i < n:
+        if i < n - 3 and p[i + 1] == "(":
+            j = p[i + 2 :].index(")")
+            offset = i + j + 3
+            yield p[i:offset]
+            i = offset
+        else:
+            yield p[i]
+            i += 1
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..424e41a
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,14 @@
+from setuptools import setup
+
+setup(
+    name="prosit",
+    version="1.0",
+    description="prediction",
+    url="http://github.com/kusterlab/prosit",
+    author="Siegfried Gessulat",
+    author_email="s.gessulat@gmail.com",
+    packages=["prosit"],
+    zip_safe=False,
+    setup_requires=["pytest-runner"],
+    tests_require=["pytest", "pylint"],
+)

From a0d86b9f9ad94bedc901f14ed1efa3b0fac475d2 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Thu, 29 Nov 2018 16:12:59 +0100
Subject: [PATCH 04/18] :pencil: fix README

---
 README.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index fcf4709..3ccb4ca 100644
--- a/README.md
+++ b/README.md
@@ -55,11 +55,11 @@ In the example GPU device 0 is used for computation. The default PORT is 5000.
         You can adjust the example above to your own needs. Send any list of (Peptide, Precursor charge, Collision energy) in the format of `/example/peptidelist.csv` to a running instance of the Prosit server.
 
 
-        ## Pseudo-code
-
-           1. Load the model given as MODEL environment variable
-              2. Start a server and wait for inputs
-                 3. On incomming request
-                        * transform peptide list to model input format (numpy arrays)
-                               * predict fragment intensity with loaded model for given peptides
-                                      * transform prediction to msms.txt output format and return response
+    ## Pseudo-code
+
+    1. Load the model given as MODEL environment variable
+    2. Start a server and wait for inputs
+    3. On incomming request
+        * transform peptide list to model input format (numpy arrays)
+        * predict fragment intensity with loaded model for given peptides
+        * transform prediction to msms.txt output format and return response

From 4013c83d2a99efe888b9c03a3b1d071638fb9bf6 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Thu, 29 Nov 2018 16:14:25 +0100
Subject: [PATCH 05/18] :pencil: update README

---
 README.md | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 3ccb4ca..ec78841 100644
--- a/README.md
+++ b/README.md
@@ -42,24 +42,24 @@ In the example GPU device 0 is used for computation. The default PORT is 5000.
 
     make server MODEL=/path/to/model/
 
-    ## Example
+## Example
 
-    Please find an example input file at `example/peptidelist.csv`. After starting the server you can run:
+Please find an example input file at `example/peptidelist.csv`. After starting the server you can run:
 
-        curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/
 
-        The example takes about 4s to run. An expected output file can be found at `examples/output_msms.txt`.
+    The example takes about 4s to run. An expected output file can be found at `examples/output_msms.txt`.
 
-        ## Using Prosit on your data
+## Using Prosit on your data
 
-        You can adjust the example above to your own needs. Send any list of (Peptide, Precursor charge, Collision energy) in the format of `/example/peptidelist.csv` to a running instance of the Prosit server.
+You can adjust the example above to your own needs. Send any list of (Peptide, Precursor charge, Collision energy) in the format of `/example/peptidelist.csv` to a running instance of the Prosit server.
 
 
-    ## Pseudo-code
+## Pseudo-code
 
-    1. Load the model given as MODEL environment variable
-    2. Start a server and wait for inputs
-    3. On incomming request
-        * transform peptide list to model input format (numpy arrays)
-        * predict fragment intensity with loaded model for given peptides
-        * transform prediction to msms.txt output format and return response
+1. Load the model given as MODEL environment variable
+2. Start a server and wait for inputs
+3. On incoming request
+    * transform peptide list to model input format (numpy arrays)
+    * predict fragment intensity with loaded model for given peptides
+    * transform prediction to msms.txt output format and return response

From 7a36c78901e2b96cc90cc333f81ffbc9caa6f1fb Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Thu, 29 Nov 2018 16:15:49 +0100
Subject: [PATCH 06/18] :pencil: fix proteomicsdb.org link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ec78841..ccfeced 100644
--- a/README.md
+++ b/README.md
@@ -2,7 +2,7 @@
 # Prosit
 
 Prosit is a deep neural network to predict iRT values and MS2 spectra for given peptide sequences. 
-You can use it at [proteomicsdb.org/prosit/](www.proteomicsdb.org/prosit/) without installation.
+You can use it at [proteomicsdb.org/prosit/](http://www.proteomicsdb.org/prosit/) without installation.
 
 [![CLA assistant](https://cla-assistant.io/readme/badge/kusterlab/prosit)](https://cla-assistant.io/kusterlab/prosit)
 

From a055a7e4127f7b5b9ebced4d3111b9ec521c493e Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Mon, 3 Dec 2018 17:46:38 +0100
Subject: [PATCH 07/18] :books: add hint

---
 prosit/losses.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/prosit/losses.py b/prosit/losses.py
index 0678ff3..06ed1b7 100644
--- a/prosit/losses.py
+++ b/prosit/losses.py
@@ -2,6 +2,7 @@
 
 
 def masked_spectral_distance(true, pred):
+    # Note: fragment ions that cannot exist (e.g. y20 for a 7mer) must have the value -1.
     import tensorflow
     import keras.backend as k
 

From f054a68b0d39cebd280cc1d0f8c125265f85ff30 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Tue, 5 Mar 2019 08:45:07 +0100
Subject: [PATCH 08/18] updated figshare link

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index ccfeced..9e5dc08 100644
--- a/README.md
+++ b/README.md
@@ -33,7 +33,7 @@ Prosit assumes your model to be in a directory that includes:
 - config.yml - a model specifying names of inputs and outputs of the model
 - weights file(s) - that follow the template `weights_{epoch}_{loss}.hdf5`
 
-You can download a pre-trained model for HCD fragmentation prediction on [figshare](https://figshare.com/account/home#/projects/35582).
+You can download a pre-trained model for HCD fragmentation prediction on https://figshare.com/projects/Prosit/35582.
 
 ## Usage
 

From 53dd545cd4716fe59b13c00dcccd37d1dbc51977 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Mon, 24 Jun 2019 10:17:31 +0200
Subject: [PATCH 09/18] :books: adds info on unsupported amino acids

---
 README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/README.md b/README.md
index 9e5dc08..6edb379 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,7 @@ Please find an example input file at `example/peptidelist.csv`. After starting t
 
 You can adjust the example above to your own needs. Send any list of (Peptide, Precursor charge, Collision energy) in the format of `/example/peptidelist.csv` to a running instance of the Prosit server.
 
+Please note: Sequences with amino acid U, O, or X are not supported. Modifications except "M(ox)" are not supported. Each C is treated as Cysteine with carbamidomethylation (fixed modification in MaxQuant).
 
 ## Pseudo-code
 

From 7748af72558d1cfabd221a7dace19f6a714aa5a6 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Fri, 28 Jun 2019 21:01:20 +0200
Subject: [PATCH 10/18] :lipstick: :sparkles: integrate iRT model

---
 Makefile             | 26 ++++----------
 prosit/alignment.py  |  4 +--
 prosit/constants.py  |  3 +-
 prosit/io.py         |  1 -
 prosit/layers.py     | 17 +++++----
 prosit/match.py      |  1 -
 prosit/prediction.py | 53 ++++++++++------------------
 prosit/sanitize.py   |  8 +++--
 prosit/server.py     | 62 ++++++++++++++++++++++++---------
 prosit/tensorize.py  | 83 ++++++++++++++++++++++++++++++++++----------
 10 files changed, 153 insertions(+), 105 deletions(-)

diff --git a/Makefile b/Makefile
index 5b9ff83..ac1dc26 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 DATA ?= $(HOME)/data.hdf5
-MODEL ?= $(HOME)/model/
+MODEL_SPECTRA ?= $(HOME)/model_sectra/
+MODEL_IRT ?= $(HOME)/model_irt/
 OUT_FOLDER ?= $(MODEL)
 HOSTPORT ?= 5000
 GPU ?= 0
@@ -12,33 +13,18 @@ build:
 	$(DOCKER) build -qf $(DOCKERFILE) -t $(IMAGE) .
 
 
-predict: build
-	$(DOCKER) run -it \
-	    -v "$(DATA)":/root/data.hdf5 \
-	    -v "$(MODEL)":/root/model/ \
-	    -v "$(OUT_FOLDER)":/root/prediction/ \
-	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
-	    $(IMAGE) python3 -m prosit.prediction
-
-
-train: build
-	$(DOCKER) run -it \
-	    -v "$(DATA)":/root/data.hdf5 \
-	    -v "$(MODEL)":/root/model/ \
-	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
-	    $(IMAGE) python3 -m prosit.training
-
-
 server: build
 	$(DOCKER) run -it \
-	    -v "$(MODEL)":/root/model/ \
+	    -v "$(MODEL_SPECTRA)":/root/model_spectra/ \
+	    -v "$(MODEL_IRT)":/root/model_irt/ \
 	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
 	    -p $(HOSTPORT):5000 \
 	    $(IMAGE) python3 -m prosit.server
 
 jump: build
 	$(DOCKER) run -it \
-	    -v "$(MODEL)":/root/model/ \
+	    -v "$(MODEL_SPECTRA)":/root/model_spectra/ \
+	    -v "$(MODEL_IRT)":/root/model_irt/ \
 	    -v "$(DATA)":/root/data.hdf5 \
 	    -e CUDA_VISIBLE_DEVICES=$(GPU) \
 	    $(IMAGE) bash
diff --git a/prosit/alignment.py b/prosit/alignment.py
index c045335..b05de70 100644
--- a/prosit/alignment.py
+++ b/prosit/alignment.py
@@ -19,7 +19,7 @@ def get_alignment_tensor(tensor, subset_size=10000):
     for cea in ACE_RANGE:
         tmp = {k: d[subset_idx] for k, d in tm.items()}
         tmp["collision_energy_aligned"] = tmp["collision_energy"] * 0 + cea
-        tmp["collision_energy_aligned_normed"] = tmp["collision_energy_aligned"] / 100.
+        tmp["collision_energy_aligned_normed"] = tmp["collision_energy_aligned"] / 100.0
         alignment_tensors[cea] = tmp
     alignment_tensor = tensorize.stack(alignment_tensors)
     return alignment_tensor
@@ -28,7 +28,7 @@ def get_alignment_tensor(tensor, subset_size=10000):
 def get_ace_dist(tensor):
     dist = {}
     for ace in ACE_RANGE:
-        mask_ace = tensor["collision_energy_aligned_normed"] == ace / 100.
+        mask_ace = tensor["collision_energy_aligned_normed"] == ace / 100.0
         mask_ace = mask_ace.reshape(mask_ace.shape[0])
         sa = numpy.median(tensor["spectral_angle"][mask_ace])
         dist[int(ace)] = sa
diff --git a/prosit/constants.py b/prosit/constants.py
index 353450b..0a4c474 100644
--- a/prosit/constants.py
+++ b/prosit/constants.py
@@ -1,5 +1,6 @@
 DATA_PATH = "/root/data.hdf5"
-MODEL_DIR = "/root/model/"
+MODEL_SPECTRA = "/root/model_spectra/"
+MODEL_IRT = "/root/model_irt/"
 OUT_DIR = "/root/prediction/"
 
 VAL_SPLIT = 0.8
diff --git a/prosit/io.py b/prosit/io.py
index ca20adc..a28977d 100644
--- a/prosit/io.py
+++ b/prosit/io.py
@@ -1,4 +1,3 @@
-
 from . import utils
 
 
diff --git a/prosit/layers.py b/prosit/layers.py
index 660fc1d..9e642f9 100644
--- a/prosit/layers.py
+++ b/prosit/layers.py
@@ -4,7 +4,6 @@
 
 
 class Attention(Layer):
-
     def __init__(
         self,
         context=False,
@@ -83,14 +82,14 @@ def compute_output_shape(self, input_shape):
 
     def get_config(self):
         config = {
-            'bias': self.bias,
-            'context': self.context,
-            'W_regularizer': regularizers.serialize(self.W_regularizer),
-            'b_regularizer': regularizers.serialize(self.b_regularizer),
-            'u_regularizer': regularizers.serialize(self.u_regularizer),
-            'W_constraint': constraints.serialize(self.W_constraint),
-            'b_constraint': constraints.serialize(self.b_constraint),
-            'u_constraint': constraints.serialize(self.u_constraint),
+            "bias": self.bias,
+            "context": self.context,
+            "W_regularizer": regularizers.serialize(self.W_regularizer),
+            "b_regularizer": regularizers.serialize(self.b_regularizer),
+            "u_regularizer": regularizers.serialize(self.u_regularizer),
+            "W_constraint": constraints.serialize(self.W_constraint),
+            "b_constraint": constraints.serialize(self.b_constraint),
+            "u_constraint": constraints.serialize(self.u_constraint),
         }
         base_config = super(Attention, self).get_config()
         return dict(list(base_config.items()) + list(config.items()))
diff --git a/prosit/match.py b/prosit/match.py
index 6c94f9b..e8177a6 100644
--- a/prosit/match.py
+++ b/prosit/match.py
@@ -97,7 +97,6 @@ def match(row, ion_types, max_charge=constants.DEFAULT_MAX_CHARGE):
 
 
 def c_lambda(matches, charge, attr):
-
     def mapping(i):
         charge_index = int(charge - 1)
         m = matches[i]
diff --git a/prosit/prediction.py b/prosit/prediction.py
index a07671c..10360d4 100644
--- a/prosit/prediction.py
+++ b/prosit/prediction.py
@@ -1,4 +1,6 @@
 import os
+import keras
+import numpy as np
 
 from . import model as model_lib
 from . import io
@@ -6,40 +8,23 @@
 from . import sanitize
 
 
-def predict(tensor, model, model_config, verbose=False):
-    import keras
+def predict(data, d_model):
     # check for mandatory keys
-    x = io.get_array(tensor, model_config["x"])
-
-    model.compile(optimizer="adam", loss="mse")
-    prediction = model.predict(
-        x, verbose=verbose, batch_size=constants.PRED_BATCH_SIZE
-    )
-    if model_config["prediction_type"] == "intensity":
-        tensor["intensities_pred"] = prediction
-        tensor = sanitize.prediction(tensor)
-    elif model_config["prediction_type"] == "iRT":
-        import numpy as np
-        tensor["iRT"] = prediction * np.sqrt(float(model_config["iRT_rescaling_var"])) + float(model_config["iRT_rescaling_mean"])
+    x = io.get_array(data, d_model["config"]["x"])
+
+    keras.backend.set_session(d_model["session"])
+    with d_model["graph"].as_default():
+        prediction = d_model["model"].predict(
+            x, verbose=True, batch_size=constants.PRED_BATCH_SIZE
+        )
+
+    if d_model["config"]["prediction_type"] == "intensity":
+        data["intensities_pred"] = prediction
+        data = sanitize.prediction(data)
+    elif d_model["config"]["prediction_type"] == "iRT":
+        scal = float(d_model["config"]["iRT_rescaling_var"])
+        mean = float(d_model["config"]["iRT_rescaling_mean"])
+        data["iRT"] = prediction * np.sqrt(scal) + mean
     else:
         raise ValueError("model_config misses parameter")
-
-    return tensor
-
-
-if __name__ == "__main__":
-    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"  # turn off tf logging
-    data_path = constants.DATA_PATH
-    model_dir = constants.MODEL_DIR
-
-    weights_path = model_lib.get_best_weights_path(model_dir)
-    weights_name = weights_path.split("/")[-1][:-5]
-    data_name = data_path.split("/")[-1][:-5]
-    model, model_config = model_lib.load(model_dir, trained=True)
-    tensor = io.from_hdf5(data_path)
-    # tensor here is a dictionary
-    tensor = predict(tensor, model, model_config, verbose=True)
-
-    path = os.path.join(constants.OUT_DIR, "prediction.hdf5")
-    # tensor here is a dictionary
-    io.to_hdf5(tensor, path)
+    return data
diff --git a/prosit/sanitize.py b/prosit/sanitize.py
index 567be8d..31dc5f8 100644
--- a/prosit/sanitize.py
+++ b/prosit/sanitize.py
@@ -26,14 +26,18 @@ def normalize_base_peak(array):
     return array
 
 
-def mask_outofrange(array, lengths, mask=-1.):
+def mask_outofrange(array, lengths, mask=-1.0):
     # dim
     for i in range(array.shape[0]):
         array[i, lengths[i] - 1 :, :, :, :] = mask
     return array
 
 
-def mask_outofcharge(array, charges, mask=-1.):
+def cap(array, nlosses=1, z=3):
+    return array[:, :, :, :nlosses, :z]
+
+
+def mask_outofcharge(array, charges, mask=-1.0):
     # dim
     for i in range(array.shape[0]):
         if charges[i] < 3:
diff --git a/prosit/server.py b/prosit/server.py
index 4799c3f..1014682 100644
--- a/prosit/server.py
+++ b/prosit/server.py
@@ -1,10 +1,12 @@
 import os
-import numpy
+import tempfile
+import warnings
 import flask
-import pandas
+from flask import after_this_request
+import pandas as pd
+import tensorflow as tf
 
-
-from . import model as model_lib
+from . import model
 from . import io
 from . import constants
 from . import tensorize
@@ -21,20 +23,48 @@ def hello():
     return "prosit!\n"
 
 
-@app.route("/predict/", methods=["POST"])
-def predict():
-    df = pandas.read_csv(flask.request.files["peptides"])
-    tensor = tensorize.peptidelist(df)
-    result = prediction.predict(tensor, model, model_config)
+def predict(csv):
+    df = pd.read_csv(csv)
+    data = tensorize.csv(df)
+    data = prediction.predict(data, d_spectra)
+    data = prediction.predict(data, d_irt)
+    return data
+
+
+@app.route("/predict/msms", methods=["POST"])
+def return_msms():
+    result = predict(flask.request.files["peptides"])
     df_pred = maxquant.convert_prediction(result)
-    path = "{}prediction.csv".format(model_dir)
-    maxquant.write(df_pred, path)
-    return flask.send_file(path)
+    tmp_f = tempfile.NamedTemporaryFile(delete=True)
+    maxquant.write(df_pred, tmp_f.name)
+
+    @after_this_request
+    def cleanup(response):
+        tmp_f.close()
+        return response
+
+    return flask.send_file(tmp_f.name)
 
 
 if __name__ == "__main__":
-    model_dir = constants.MODEL_DIR
-    global model
-    global model_config
-    model, model_config = model_lib.load(model_dir, trained=True)
+    warnings.filterwarnings("ignore")
+    global d_spectra
+    global d_irt
+    d_spectra = {}
+    d_irt = {}
+
+    d_spectra["graph"] = tf.Graph()
+    with d_spectra["graph"].as_default():
+        d_spectra["session"] = tf.Session()
+        with d_spectra["session"].as_default():
+            d_spectra["model"], d_spectra["config"] = model.load(
+                constants.MODEL_SPECTRA
+            )
+            d_spectra["model"].compile(optimizer="adam", loss="mse")
+    d_irt["graph"] = tf.Graph()
+    with d_irt["graph"].as_default():
+        d_irt["session"] = tf.Session()
+        with d_irt["session"].as_default():
+            d_irt["model"], d_irt["config"] = model.load(constants.MODEL_IRT)
+            d_irt["model"].compile(optimizer="adam", loss="mse")
     app.run(host="0.0.0.0")
diff --git a/prosit/tensorize.py b/prosit/tensorize.py
index cebc034..d4eefdb 100644
--- a/prosit/tensorize.py
+++ b/prosit/tensorize.py
@@ -1,11 +1,21 @@
 import collections
-import numpy
+import numpy as np
 
 from . import constants
 from . import utils
-
-
-# helpers
+from . import match
+from . import annotate
+from . import sanitize
+from .constants import (
+    CHARGES,
+    MAX_SEQUENCE,
+    ALPHABET,
+    MAX_ION,
+    NLOSSES,
+    CHARGES,
+    ION_TYPES,
+    ION_OFFSET,
+)
 
 
 def stack(queue):
@@ -19,46 +29,81 @@ def stack(queue):
         if isinstance(d[0], list):
             stacked[k] = [item for sublist in d for item in sublist]
         else:
-            stacked[k] = numpy.vstack(d)
+            stacked[k] = np.vstack(d)
     return stacked
 
 
 def get_numbers(vals, dtype=float):
-    a = numpy.array(vals).astype(dtype)
+    a = np.array(vals).astype(dtype)
     return a.reshape([len(vals), 1])
 
 
 def get_precursor_charge_onehot(charges):
-    array = numpy.zeros([len(charges), max(constants.CHARGES)], dtype=int)
+    array = np.zeros([len(charges), max(CHARGES)], dtype=int)
     for i, precursor_charge in enumerate(charges):
         array[i, precursor_charge - 1] = 1
     return array
 
 
 def get_sequence_integer(sequences):
-    array = numpy.zeros([len(sequences), constants.MAX_SEQUENCE], dtype=int)
+    array = np.zeros([len(sequences), MAX_SEQUENCE], dtype=int)
     for i, sequence in enumerate(sequences):
         for j, s in enumerate(utils.peptide_parser(sequence)):
-            array[i, j] = constants.ALPHABET[s]
+            array[i, j] = ALPHABET[s]
     return array
 
 
-# file types
+def parse_ion(string):
+    ion_type = ION_TYPES.index(string[0])
+    if ("-") in string:
+        ion_n, suffix = string[1:].split("-")
+    else:
+        ion_n = string[1:]
+        suffix = ""
+    return ion_type, int(ion_n) - 1, NLOSSES.index(suffix)
+
+
+def get_mz_applied(df, ion_types="yb"):
+    ito = {it: ION_OFFSET[it] for it in ion_types}
 
+    def calc_row(row):
+        array = np.zeros([MAX_ION, len(ION_TYPES), len(NLOSSES), len(CHARGES)])
+        fw, bw = match.get_forward_backward(row.modified_sequence)
+        for z in range(row.precursor_charge):
+            zpp = z + 1
+            annotation = annotate.get_annotation(fw, bw, zpp, ito)
+            for ion, mz in annotation.items():
+                it, _in, nloss = parse_ion(ion)
+                array[_in, it, nloss, z] = mz
+        return [array]
 
-def peptidelist(df):
+    mzs_series = df.apply(calc_row, 1)
+    out = np.squeeze(np.stack(mzs_series))
+    if len(out.shape) == 4:
+        out = out.reshape([1] + list(out.shape))
+    return out
+
+
+def csv(df):
     df.reset_index(drop=True, inplace=True)
     assert "modified_sequence" in df.columns
     assert "collision_energy" in df.columns
     assert "precursor_charge" in df.columns
-    tensor = {
-        "collision_energy_aligned_normed": get_numbers(df.collision_energy) / 100.,
+    data = {
+        "collision_energy_aligned_normed": get_numbers(df.collision_energy) / 100.0,
         "sequence_integer": get_sequence_integer(df.modified_sequence),
         "precursor_charge_onehot": get_precursor_charge_onehot(df.precursor_charge),
+        "masses_pred": get_mz_applied(df),
     }
-    return tensor
-
-
-def msms_txt(df):
-    # TODO: implement
-    pass
+    nlosses = 1
+    z = 3
+    lengths = (data["sequence_integer"] > 0).sum(1)
+
+    masses_pred = get_mz_applied(df)
+    masses_pred = sanitize.cap(masses_pred, nlosses, z)
+    masses_pred = sanitize.mask_outofrange(masses_pred, lengths)
+    masses_pred = sanitize.mask_outofcharge(masses_pred, df.precursor_charge)
+    masses_pred = sanitize.reshape_flat(masses_pred)
+    data["masses_pred"] = masses_pred
+
+    return data

From 0b9569c7cd9828c5d60c381004535844afa72f8e Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Sat, 29 Jun 2019 17:40:49 +0200
Subject: [PATCH 11/18] :sparkles: add generic text output format

---
 Dockerfile                          |   2 +-
 README.md                           |  20 +++--
 prosit/__init__.py                  |   4 +-
 prosit/converters/__init__.py       |   2 +
 prosit/converters/generic.py        | 131 ++++++++++++++++++++++++++++
 prosit/{ => converters}/maxquant.py |   8 +-
 prosit/server.py                    |  21 ++++-
 7 files changed, 170 insertions(+), 18 deletions(-)
 create mode 100644 prosit/converters/__init__.py
 create mode 100644 prosit/converters/generic.py
 rename prosit/{ => converters}/maxquant.py (97%)

diff --git a/Dockerfile b/Dockerfile
index 880278a..89e8264 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 FROM tensorflow/tensorflow:1.10.1-gpu-py3
-RUN pip install keras==2.2.1 h5py tables flask
+RUN pip install keras==2.2.1 h5py tables flask pyteomics
 
 ENV KERAS_BACKEND=tensorflow
 ENV TF_CPP_MIN_LOG_LEVEL=3
diff --git a/README.md b/README.md
index 6edb379..5a21439 100644
--- a/README.md
+++ b/README.md
@@ -27,26 +27,30 @@ The time installation takes is dependent on your download speed (Prosit download
 
 ## Model
 
-Prosit assumes your model to be in a directory that includes:
+Prosit assumes your models are in directories that look like this:
 
 - model.yml - a saved keras model
 - config.yml - a model specifying names of inputs and outputs of the model
 - weights file(s) - that follow the template `weights_{epoch}_{loss}.hdf5`
 
-You can download a pre-trained model for HCD fragmentation prediction on https://figshare.com/projects/Prosit/35582.
+You can download pre-trained models for HCD fragmentation prediction and iRT prediction on https://figshare.com/projects/Prosit/35582.
 
 ## Usage
 
 The following command will load your model from `/path/to/model/`.
 In the example GPU device 0 is used for computation. The default PORT is 5000.
 
-    make server MODEL=/path/to/model/
+    make server MODEL_SPECTRA=/path/to/fragmentation_model/ MODEL_IRT=/path/to/irt_model/
+
+Currently two output formats are supported: a MaxQuant style `msms.txt` not including the iRT value and a generic text file (that works with Spectronaut)
 
 ## Example
 
-Please find an example input file at `example/peptidelist.csv`. After starting the server you can run:
+Please find an example input file at `example/peptidelist.csv`. After starting the server you can run the following commands, depending on what output format you prefer:
+
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/msms
 
-    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/generic
 
     The example takes about 4s to run. An expected output file can be found at `examples/output_msms.txt`.
 
@@ -58,9 +62,9 @@ Please note: Sequences with amino acid U, O, or X are not supported. Modificatio
 
 ## Pseudo-code
 
-1. Load the model given as MODEL environment variable
+1. Load the models given in the MODEL\_X environment variables
 2. Start a server and wait for inputs
 3. On incomming request
     * transform peptide list to model input format (numpy arrays)
-    * predict fragment intensity with loaded model for given peptides
-    * transform prediction to msms.txt output format and return response
+    * predict fragment intensity and iRT with the loaded models for the given peptides
+    * transform prediction to the requested output format and return response
diff --git a/prosit/__init__.py b/prosit/__init__.py
index 042f8f5..d0e5b19 100644
--- a/prosit/__init__.py
+++ b/prosit/__init__.py
@@ -7,6 +7,6 @@
 from . import server
 from . import layers
 from . import sanitize
+from . import converters
 
-
-__version__ = "1.0"
+__version__ = "1.1"
diff --git a/prosit/converters/__init__.py b/prosit/converters/__init__.py
new file mode 100644
index 0000000..681720b
--- /dev/null
+++ b/prosit/converters/__init__.py
@@ -0,0 +1,2 @@
+from . import generic
+from . import maxquant
diff --git a/prosit/converters/generic.py b/prosit/converters/generic.py
new file mode 100644
index 0000000..3713cb7
--- /dev/null
+++ b/prosit/converters/generic.py
@@ -0,0 +1,131 @@
+import pandas as pd
+import numpy as np
+import multiprocessing as mp
+import pyteomics.mass
+
+from ..constants import MAX_ION, ION_TYPES, MAX_FRAG_CHARGE
+from .. import utils
+
+
+aa_comp = dict(pyteomics.mass.std_aa_comp)
+aa_comp["o"] = pyteomics.mass.Composition({"O": 1})
+translate2spectronaut = {"C": "C[Carbamidomethyl (C)]", "M(ox)": "M[Oxidation (M)]"}
+shape = [MAX_ION, len(ION_TYPES), MAX_FRAG_CHARGE]
+FragmentNumber = np.zeros(shape, dtype=int)
+FragmentType = np.zeros(shape, dtype="object")
+FragmentCharge = np.zeros(shape, dtype=int)
+
+for z in range(MAX_FRAG_CHARGE):
+    for j in range(MAX_ION):
+        for tyi, ty in enumerate(ION_TYPES):
+            FragmentNumber[j, tyi, z] = j + 1
+            FragmentType[j, tyi, z] = ty
+            FragmentCharge[j, tyi, z] = z + 1
+
+FragmentNumber = FragmentNumber.flatten()
+FragmentType = FragmentType.flatten()
+FragmentCharge = FragmentCharge.flatten()
+
+
+def convert_spectrum(data):
+    df = pd.DataFrame(
+        {
+            "RelativeIntensity": data["intensities_pred"],
+            "FragmentMz": data["masses_pred"],
+            "idx": list(range(174)),
+        }
+    )
+    spectrum = df[df.RelativeIntensity > 0].reset_index(drop=True)
+    idx = list(spectrum.idx)
+    sequence = utils.get_sequence(data["sequence_integer"])
+    charge = int(data["precursor_charge_onehot"].argmax() + 1)
+    irt = float(data["iRT"])
+    precursor_mz = pyteomics.mass.calculate_mass(
+        sequence=sequence.replace("M(ox)", "oM"), charge=charge, aa_comp=aa_comp
+    )
+
+    spectrum["ModifiedPeptide"] = sequence
+    spectrum["LabeledPeptide"] = sequence
+    spectrum["StrippedPeptide"] = spectrum.LabeledPeptide.map(
+        lambda p: p.replace("M(ox)", "M")
+    )
+    spectrum["PrecursorCharge"] = charge
+    spectrum["PrecursorMz"] = precursor_mz
+    spectrum["iRT"] = irt
+    spectrum["FragmentNumber"] = FragmentNumber[idx]
+    spectrum["FragmentType"] = FragmentType[idx]
+    spectrum["FragmentCharge"] = FragmentCharge[idx]
+    spectrum["FragmentLossType"] = "noloss"
+    for source, target in translate2spectronaut.items():
+        spectrum["ModifiedPeptide"] = spectrum.ModifiedPeptide.map(
+            lambda s: s.replace(source, target)
+        )
+    spectrum["ModifiedPeptide"] = spectrum.ModifiedPeptide.map(lambda s: "_" + s + "_")
+    del spectrum["idx"]
+    return spectrum
+
+
+class Converter:
+    def __init__(self, data, out_path, maxsize=256, batch_size=32):
+        self.data = data
+        self.out_path = out_path
+        self.queue = mp.Manager().Queue(maxsize)
+        self.batch_size = batch_size
+        self.cores = mp.cpu_count()
+
+    def batch(self, iterable):
+        l = len(iterable)
+        for ndx in range(0, l, self.batch_size):
+            yield iterable[ndx : min(ndx + self.batch_size, l)]
+
+    def slice_data(self, i):
+        return {k: d[i] for k, d in self.data.items()}
+
+    def fill_queue(self, pool):
+        n = self.data["sequence_integer"].shape[0]
+        indeces = list(range(n))
+
+        for b in self.batch(indeces):
+            spectra = pool.map(convert_spectrum, [self.slice_data(i) for i in b])
+            for s in spectra:
+                self.queue.put(s)
+
+        # Stop writing process
+        self.queue.put(None)
+
+    def get_converted(self):
+        while True:
+            x = self.queue.get()
+            if x is None:
+                break
+            else:
+                yield x
+
+    def to_csv(self):
+        # keeps file open
+        with open(self.out_path, "w") as _file:
+            converted = self.get_converted()
+            spectrum = next(converted)
+            spectrum.to_csv(_file, index=False)
+            for spectrum in converted:
+                spectrum.to_csv(_file, header=False, index=False)
+
+    def convert(self):
+        io_process = mp.Process(target=self.to_csv)
+        io_process.daemon = True
+        io_process.start()
+        with mp.Pool(processes=self.cores * 2) as pool:
+            self.fill_queue(pool)
+        io_process.join()
+
+
+if __name__ == "__main__":
+
+    #data = pwyll.tensorize.read(HDF5_PATH)
+    conv = ConverterSP(data, to_spectronaut, OUT_PATH)
+    io_process = mp.Process(target=conv.to_csv)
+    io_process.daemon = True
+    io_process.start()
+    with mp.Pool(processes=N_CORES * 2) as pool:
+        conv.fill_queue(pool)
+    io_process.join()
diff --git a/prosit/maxquant.py b/prosit/converters/maxquant.py
similarity index 97%
rename from prosit/maxquant.py
rename to prosit/converters/maxquant.py
index 50124ce..4645724 100644
--- a/prosit/maxquant.py
+++ b/prosit/converters/maxquant.py
@@ -2,10 +2,10 @@
 
 import os
 
-from . import constants
-from . import match
-from . import annotate
-from . import utils
+from .. import constants
+from .. import match
+from .. import annotate
+from .. import utils
 
 COL_SEP = "\t"
 
diff --git a/prosit/server.py b/prosit/server.py
index 1014682..3cc0fc9 100644
--- a/prosit/server.py
+++ b/prosit/server.py
@@ -12,7 +12,7 @@
 from . import tensorize
 from . import prediction
 from . import alignment
-from . import maxquant
+from . import converters
 
 
 app = flask.Flask(__name__)
@@ -31,12 +31,27 @@ def predict(csv):
     return data
 
 
+@app.route("/predict/generic", methods=["POST"])
+def return_generic():
+    result = predict(flask.request.files["peptides"])
+    tmp_f = tempfile.NamedTemporaryFile(delete=True)
+    c = converters.generic.Converter(result, tmp_f.name)
+    c.convert()
+
+    @after_this_request
+    def cleanup(response):
+        tmp_f.close()
+        return response
+
+    return flask.send_file(tmp_f.name)
+
+
 @app.route("/predict/msms", methods=["POST"])
 def return_msms():
     result = predict(flask.request.files["peptides"])
-    df_pred = maxquant.convert_prediction(result)
+    df_pred = converters.maxquant.convert_prediction(result)
     tmp_f = tempfile.NamedTemporaryFile(delete=True)
-    maxquant.write(df_pred, tmp_f.name)
+    converters.maxquant.write(df_pred, tmp_f.name)
 
     @after_this_request
     def cleanup(response):

From 159b9b02c4ebd0b4df23202d9dc624af3032434e Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Mon, 1 Jul 2019 09:29:27 +0200
Subject: [PATCH 12/18] :sparkles: initial addition of msp

---
 Dockerfile                    |   2 +-
 prosit/converters/__init__.py |   1 +
 prosit/converters/msp.py      | 202 ++++++++++++++++++++++++++++++++++
 3 files changed, 204 insertions(+), 1 deletion(-)
 create mode 100644 prosit/converters/msp.py

diff --git a/Dockerfile b/Dockerfile
index 89e8264..4251b28 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,5 +1,5 @@
 FROM tensorflow/tensorflow:1.10.1-gpu-py3
-RUN pip install keras==2.2.1 h5py tables flask pyteomics
+RUN pip install keras==2.2.1 h5py tables flask pyteomics lxml
 
 ENV KERAS_BACKEND=tensorflow
 ENV TF_CPP_MIN_LOG_LEVEL=3
diff --git a/prosit/converters/__init__.py b/prosit/converters/__init__.py
index 681720b..21b0e6f 100644
--- a/prosit/converters/__init__.py
+++ b/prosit/converters/__init__.py
@@ -1,2 +1,3 @@
 from . import generic
 from . import maxquant
+from . import msp
diff --git a/prosit/converters/msp.py b/prosit/converters/msp.py
new file mode 100644
index 0000000..208f486
--- /dev/null
+++ b/prosit/converters/msp.py
@@ -0,0 +1,202 @@
+import pandas as pd
+import numpy as np
+import h5py
+import pyteomics
+
+from .. import constants
+from .. import utils
+
+class Converter():
+    def __init__(self, data, out_path):
+        self.out_path = out_path
+        self.data = data
+
+    def convert(self):
+        IONS = get_ions().reshape(174, -1).flatten()
+        # TODO VECTORIZE CHARGE AND ENERGY
+        with open(self.out_path, mode="w", encoding="utf-8") as f:
+            first_spec = True
+            for i in range(self.data["iRT"].shape[0]):
+                aIntensity = self.data["intensities_pred"][i]
+                sel = np.where(aIntensity > 0)
+                aIntensity = aIntensity[sel]
+                print(aIntensity)
+                collision_energy = self.data["collision_energy_aligned_normed"][i] * 100
+                iRT = self.data["iRT"][i]
+                aMass = self.data["masses_pred"][i][sel]
+                precursor_charge = self.data["precursor_charge_onehot"][i]
+                sequence_integer = self.data["sequence_integer"][i]
+                aIons = IONS[sel]
+                spec = Spectrum(
+                    aIntensity,
+                    collision_energy,
+                    iRT,
+                    aMass,
+                    precursor_charge,
+                    sequence_integer,
+                    aIons,
+                )
+                if not first_spec:
+                    f.write("\n")
+                first_spec = False
+                f.write(str(spec))
+        return spec
+
+
+def generate_aa_comp():
+    """
+    >>> aa_comp = generate_aa_comp()
+    >>> aa_comp["M"]
+    Composition({'H': 9, 'C': 5, 'S': 1, 'O': 1, 'N': 1})
+    >>> aa_comp["Z"]
+    Composition({'H': 9, 'C': 5, 'S': 1, 'O': 2, 'N': 1})
+    """
+    db = pyteomics.mass.Unimod()
+    aa_comp = dict(pyteomics.mass.std_aa_comp)
+    s = db.by_title("Oxidation")["composition"]
+    aa_comp["Z"] = aa_comp["M"] + s
+    s = db.by_title("Carbamidomethyl")["composition"]
+    aa_comp["C"] = aa_comp["C"] + s
+    return aa_comp
+
+
+aa_comp = generate_aa_comp()
+
+
+
+
+def get_ions():
+    x = np.empty(
+        [constants.MAX_ION, len(constants.ION_TYPES), constants.MAX_FRAG_CHARGE],
+        dtype="|S6",
+    )
+    for fz in range(constants.MAX_FRAG_CHARGE):
+        for fty_i, fty in enumerate(constants.ION_TYPES):
+            for fi in range(constants.MAX_ION):
+                ion = fty + str(fi + 1)
+                if fz > 0:
+                    ion += "({}+)".format(fz + 1)
+                x[fi, fty_i, fz] = ion
+    x.flatten()
+    return x
+
+
+ox_int = constants.ALPHABET["M(ox)"]
+c_int = constants.ALPHABET["C"]
+
+
+def calculate_mods(sequence_integer):
+    """
+    >>> x = np.array([2, 15, 4, 3, 0, 0])
+    >>> calculate_mods(x)
+    1
+    >>> x = np.array([2, 15, 21, 3, 0, 0])
+    >>> calculate_mods(x)
+    2
+    """
+    # TODO could be vectorized in numpy
+    return len(np.where((sequence_integer == ox_int) | (sequence_integer == c_int))[0])
+
+
+def generate_mods_string_tuples(sequence_integer):
+    list_mods = []
+    for mod in [ox_int, c_int]:
+        for position in np.where(sequence_integer == mod)[0]:
+            if mod == c_int:
+                list_mods.append((position, "C", "Carbamidomethyl"))
+            elif mod == ox_int:
+                list_mods.append((position, "M", "Oxidation"))
+            else:
+                raise ValueError("cant be true")
+    list_mods.sort(key=lambda tup: tup[0])  # inplace
+    return list_mods
+
+
+def generate_mod_strings(sequence_integer):
+    """
+    >>> x = np.array([1,2,3,1,2,21,0])
+    >>> y, z = generate_mod_strings(x)
+    >>> y
+    '3/1,C,Carbamidomethyl/4,C,Carbamidomethyl/5,M,Oxidation'
+    >>> z
+    'Carbamidomethyl@C2; Carbamidomethyl@C5; Oxidation@M6'
+    """
+    list_mods = generate_mods_string_tuples(sequence_integer)
+    if len(list_mods) == 0:
+        return "0", ""
+    else:
+        returnString_mods = ""
+        returnString_modString = ""
+        returnString_mods += str(len(list_mods))
+        for i, mod_tuple in enumerate(list_mods):
+            returnString_mods += (
+                "/" + str(mod_tuple[0]) + "," + mod_tuple[1] + "," + mod_tuple[2]
+            )
+            if i == 0:
+                returnString_modString += (
+                    mod_tuple[2] + "@" + mod_tuple[1] + str(mod_tuple[0] + 1)
+                )
+            else:
+                returnString_modString += (
+                    "; " + mod_tuple[2] + "@" + mod_tuple[1] + str(mod_tuple[0] + 1)
+                )
+
+    return returnString_mods, returnString_modString
+
+
+class Spectrum(object):
+    def __init__(
+        self,
+        aIntensity,
+        collision_energy,
+        iRT,
+        aMass,
+        precursor_charge,
+        sequence_integer,
+        aIons,
+    ):
+        self.aIntensity = aIntensity
+        self.collision_energy = collision_energy
+        self.iRT = iRT
+        self.aMass = aMass
+        self.precursor_charge = precursor_charge
+        self.aIons = aIons
+        self.mod, self.mod_string = generate_mod_strings(sequence_integer)
+        self.sequence = utils.get_sequence(sequence_integer)
+        # amino acid Z which is defined at the toplevel in generate_aa_comp
+        self.precursor_mass = pyteomics.mass.calculate_mass(
+            self.sequence.replace("M(ox)", "Z"),
+            aa_comp=aa_comp,
+            ion_type="M",
+            charge=int(self.precursor_charge),
+        )
+
+        # TODO clean solution https://pyteomics.readthedocs.io/en/latest/mass.html
+
+    def __str__(self):
+        s = "Name: {sequence}/{charge}\nMW: {precursor_mass}\n"
+        s += "Comment: Parent={precursor_mass} Collision_energy={collision_energy} "
+        s += "Mods={mod} ModString={sequence}//{mod_string}/{charge}"
+        s += "\nNum peaks: {num_peaks}"
+        num_peaks = len(self.aIntensity)
+        s = s.format(
+            sequence=self.sequence.replace("M(ox)", "M"),
+            charge=self.precursor_charge,
+            precursor_mass=self.precursor_mass,
+            collision_energy=np.round(self.collision_energy[0], 0),
+            mod=self.mod,
+            mod_string=self.mod_string,
+            num_peaks=num_peaks,
+        )
+        for mz, intensity, ion in zip(self.aMass, self.aIntensity, self.aIons):
+            s += "\n" + str(mz) + "\t" + str(intensity) + '\t"'
+            s += ion.decode("UTF-8").replace("(", "^").replace("+", "") + '/0.0ppm"'
+        return s
+
+
+if __name__ == "__main__":
+    HDF5_PATH = "/mnt/global/ucc_ml/bierdimpfl/workDir/20/data.hdf5"
+    OUTPATH = "myPrositLib.msp"
+    # HDF5_PATH = "/home/tschmidt/tmp/hela/forward/data.hdf5"
+    data = load_data(HDF5_PATH)
+    s = iter_data(data, OUTPATH)

From 5feef35fca9c273e8e2aeb280967a2bd546cbf40 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Tue, 2 Jul 2019 13:45:16 +0200
Subject: [PATCH 13/18] :sparkles: add msp output format

---
 prosit/converters/msp.py | 86 ++++++++++++++++------------------------
 prosit/server.py         | 15 +++++++
 2 files changed, 50 insertions(+), 51 deletions(-)

diff --git a/prosit/converters/msp.py b/prosit/converters/msp.py
index 208f486..e0f0b1e 100644
--- a/prosit/converters/msp.py
+++ b/prosit/converters/msp.py
@@ -1,47 +1,9 @@
-import pandas as pd
 import numpy as np
-import h5py
 import pyteomics
 
 from .. import constants
 from .. import utils
 
-class Converter():
-    def __init__(self, data, out_path):
-        self.out_path = out_path
-        self.data = data
-
-    def convert(self):
-        IONS = get_ions().reshape(174, -1).flatten()
-        # TODO VECTORIZE CHARGE AND ENERGY
-        with open(self.out_path, mode="w", encoding="utf-8") as f:
-            first_spec = True
-            for i in range(self.data["iRT"].shape[0]):
-                aIntensity = self.data["intensities_pred"][i]
-                sel = np.where(aIntensity > 0)
-                aIntensity = aIntensity[sel]
-                print(aIntensity)
-                collision_energy = self.data["collision_energy_aligned_normed"][i] * 100
-                iRT = self.data["iRT"][i]
-                aMass = self.data["masses_pred"][i][sel]
-                precursor_charge = self.data["precursor_charge_onehot"][i]
-                sequence_integer = self.data["sequence_integer"][i]
-                aIons = IONS[sel]
-                spec = Spectrum(
-                    aIntensity,
-                    collision_energy,
-                    iRT,
-                    aMass,
-                    precursor_charge,
-                    sequence_integer,
-                    aIons,
-                )
-                if not first_spec:
-                    f.write("\n")
-                first_spec = False
-                f.write(str(spec))
-        return spec
-
 
 def generate_aa_comp():
     """
@@ -63,8 +25,6 @@ def generate_aa_comp():
 aa_comp = generate_aa_comp()
 
 
-
-
 def get_ions():
     x = np.empty(
         [constants.MAX_ION, len(constants.ION_TYPES), constants.MAX_FRAG_CHARGE],
@@ -94,7 +54,6 @@ def calculate_mods(sequence_integer):
     >>> calculate_mods(x)
     2
     """
-    # TODO could be vectorized in numpy
     return len(np.where((sequence_integer == ox_int) | (sequence_integer == c_int))[0])
 
 
@@ -144,6 +103,41 @@ def generate_mod_strings(sequence_integer):
     return returnString_mods, returnString_modString
 
 
+class Converter():
+    def __init__(self, data, out_path):
+        self.out_path = out_path
+        self.data = data
+
+    def convert(self):
+        IONS = get_ions().reshape(174, -1).flatten()
+        with open(self.out_path, mode="w", encoding="utf-8") as f:
+            first_spec = True
+            for i in range(self.data["iRT"].shape[0]):
+                aIntensity = self.data["intensities_pred"][i]
+                sel = np.where(aIntensity > 0)
+                aIntensity = aIntensity[sel]
+                collision_energy = self.data["collision_energy_aligned_normed"][i] * 100
+                iRT = self.data["iRT"][i]
+                aMass = self.data["masses_pred"][i][sel]
+                precursor_charge = self.data["precursor_charge_onehot"][i].argmax() + 1
+                sequence_integer = self.data["sequence_integer"][i]
+                aIons = IONS[sel]
+                spec = Spectrum(
+                    aIntensity,
+                    collision_energy,
+                    iRT,
+                    aMass,
+                    precursor_charge,
+                    sequence_integer,
+                    aIons,
+                )
+                if not first_spec:
+                    f.write("\n")
+                first_spec = False
+                f.write(str(spec))
+        return spec
+
+
 class Spectrum(object):
     def __init__(
         self,
@@ -171,8 +165,6 @@ def __init__(
             charge=int(self.precursor_charge),
         )
 
-        # TODO clean solution https://pyteomics.readthedocs.io/en/latest/mass.html
-
     def __str__(self):
         s = "Name: {sequence}/{charge}\nMW: {precursor_mass}\n"
         s += "Comment: Parent={precursor_mass} Collision_energy={collision_energy} "
@@ -192,11 +184,3 @@ def __str__(self):
             s += "\n" + str(mz) + "\t" + str(intensity) + '\t"'
             s += ion.decode("UTF-8").replace("(", "^").replace("+", "") + '/0.0ppm"'
         return s
-
-
-if __name__ == "__main__":
-    HDF5_PATH = "/mnt/global/ucc_ml/bierdimpfl/workDir/20/data.hdf5"
-    OUTPATH = "myPrositLib.msp"
-    # HDF5_PATH = "/home/tschmidt/tmp/hela/forward/data.hdf5"
-    data = load_data(HDF5_PATH)
-    s = iter_data(data, OUTPATH)
diff --git a/prosit/server.py b/prosit/server.py
index 3cc0fc9..a4480ca 100644
--- a/prosit/server.py
+++ b/prosit/server.py
@@ -46,6 +46,21 @@ def cleanup(response):
     return flask.send_file(tmp_f.name)
 
 
+@app.route("/predict/msp", methods=["POST"])
+def return_msp():
+    result = predict(flask.request.files["peptides"])
+    tmp_f = tempfile.NamedTemporaryFile(delete=True)
+    c = converters.msp.Converter(result, tmp_f.name)
+    c.convert()
+
+    @after_this_request
+    def cleanup(response):
+        tmp_f.close()
+        return response
+
+    return flask.send_file(tmp_f.name)
+
+
 @app.route("/predict/msms", methods=["POST"])
 def return_msms():
     result = predict(flask.request.files["peptides"])

From cf31b5ba1b3295440299099612f6750beeabb62c Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Tue, 2 Jul 2019 14:16:54 +0200
Subject: [PATCH 14/18] :books: update readme - output formats

---
 README.md                    |  8 ++--
 examples/output_msms.txt     |  4 --
 examples/peptidelist.generic | 82 +++++++++++++++++++++++++++++++
 examples/peptidelist.msms    |  4 ++
 examples/peptidelist.msp     | 93 ++++++++++++++++++++++++++++++++++++
 5 files changed, 184 insertions(+), 7 deletions(-)
 delete mode 100644 examples/output_msms.txt
 create mode 100644 examples/peptidelist.generic
 create mode 100644 examples/peptidelist.msms
 create mode 100644 examples/peptidelist.msp

diff --git a/README.md b/README.md
index 5a21439..4dc40a3 100644
--- a/README.md
+++ b/README.md
@@ -48,11 +48,13 @@ Currently two output formats are supported: a MaxQuant style `msms.txt` not incl
 
 Please find an example input file at `example/peptidelist.csv`. After starting the server you can run the following commands, depending on what output format you prefer:
 
-    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/msms
-
     curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/generic
 
-    The example takes about 4s to run. An expected output file can be found at `examples/output_msms.txt`.
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/msp
+
+    curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/msms
+
+    The examples take about 4s to run. Expected output files (.generic, .msp and .msms) can be found in `examples/`.
 
 ## Using Prosit on your data
 
diff --git a/examples/output_msms.txt b/examples/output_msms.txt
deleted file mode 100644
index 0d6628d..0000000
--- a/examples/output_msms.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-Intensities	Masses	Matches	Modified Sequence	Charge
-1.0;0.412596;0.730622;0.304245;0.116402;0.0937164;0.0809876;0.192307;0.0859734;0.15234;0.0825881;0.0125858;0.0341731;0.0134653;0.0030653;0.00143102;0.00824674;0.00419197;0.000614032;0.00130399;0.0022207;0.000594303	175.118952167;322.154347167;435.238411167;548.322475167;619.359589167;690.396703167;761.433817167;263.088246467;360.141010467;431.178124467;502.215238467;573.252352467;218.122843817;429.746928817;132.047761467;180.574143467;216.092700467;251.611257467;207.124714034;230.803752034;254.482790034;229.450323134	y1;y2;y3;y4;y5;y6;y7;b2;b3;b4;b5;b6;y3(2+);y8(2+);b2(2+);b3(2+);b4(2+);b5(2+);y5(3+);y6(3+);y7(3+);b7(3+)	MMPAAALIM(ox)R	3
-0.0318386;0.0534038;0.000866649;0.165916;0.61992;1.0;0.37558;0.00687848;0.85537;0.186023;0.00952645;0.125391;0.666909;0.0563227;0.000254796;0.00419896	147.112804167;294.148199167;407.232263167;504.285027167;601.337791167;698.390555167;769.427669167;882.511733167;245.131825467;316.168939467;413.221703467;301.172533817;349.698915817;385.217472817;158.588107967;207.114489967	y1;y2;y3;y4;y5;y6;y7;y8;b2;b3;b4;y5(2+);y6(2+);y7(2+);b3(2+);b4(2+)	MLAPPPIM(ox)K	2
-0.647814;0.762372;1.0;0.785805;0.244676;0.0376843;0.985027;0.468406;0.305399;0.572137;0.273087;0.0259239;0.315776;0.694665;0.426082;0.00298714;0.0162049;0.0243926;0.00191334;0.027638;0.00831755;0.00103351	175.118952167;419.207111167;516.259875167;613.312639167;710.365403167;132.047761467;288.148872467;359.185986467;472.270050467;585.354114467;698.438178467;811.522242467;258.633575817;307.159957817;355.686339817;180.096631467;236.638663467;293.180695467;349.722727467;59.0445017003;237.459985367;44.6874381337	y1;y3;y4;y5;y6;b1;b2;b3;b4;b5;b6;b7;y4(2+);y5(2+);y6(2+);b3(2+);b4(2+);b5(2+);b6(2+);y1(3+);y6(3+);b1(3+)	MRALLLIPPPPM(ox)R	6
diff --git a/examples/peptidelist.generic b/examples/peptidelist.generic
new file mode 100644
index 0000000..00bfe84
--- /dev/null
+++ b/examples/peptidelist.generic
@@ -0,0 +1,82 @@
+FragmentMz,RelativeIntensity,ModifiedPeptide,LabeledPeptide,StrippedPeptide,PrecursorCharge,PrecursorMz,iRT,FragmentNumber,FragmentType,FragmentCharge,FragmentLossType
+175.118952167,0.55108744,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,1,y,1,noloss
+44.68743813366666,0.17598717,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,1,b,3,noloss
+322.15434716699997,0.7980077,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,2,y,1,noloss
+88.36759980033332,0.17649946,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,2,b,3,noloss
+435.238411167,0.9098875,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,3,y,1,noloss
+120.71852113366667,0.13785253,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,3,b,3,noloss
+548.322475167,0.96106666,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,4,y,1,noloss
+144.39755913366665,0.101257205,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,4,b,3,noloss
+619.359589167,0.9845231,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,5,y,1,noloss
+168.07659713366664,0.075195946,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,5,b,3,noloss
+690.3967031669999,0.99360347,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,6,y,1,noloss
+191.75563513366663,0.058385145,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,6,b,3,noloss
+761.4338171669999,0.9969484,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,7,y,1,noloss
+229.45032313366664,0.0481462,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,7,b,3,noloss
+858.4865811669999,0.99818367,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,8,y,1,noloss
+267.1450111336666,0.04209969,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,8,b,3,noloss
+989.5270661669999,0.99877864,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,9,y,1,noloss
+316.15680946699996,0.038592704,_MMPAAALIM[Oxidation (M)]R_,MMPAAALIM(ox)R,MMPAAALIMR,3,374.19403587446,56.413185119628906,9,b,3,noloss
+74.060040317,0.19450517,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,1,y,2,noloss
+132.047761467,0.006076492,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,1,b,1,noloss
+66.52751896699999,0.5418572,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,1,b,2,noloss
+147.57773781699998,0.34726548,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,2,y,2,noloss
+245.131825467,0.009780246,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,2,b,1,noloss
+123.069550967,0.79382604,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,2,b,2,noloss
+204.119769817,0.45179287,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,3,y,2,noloss
+316.168939467,0.018071838,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,3,b,1,noloss
+158.58810796699998,0.911572,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,3,b,2,noloss
+252.646151817,0.5201007,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,4,y,2,noloss
+413.221703467,0.028130919,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,4,b,1,noloss
+207.114489967,0.9655734,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,4,b,2,noloss
+301.17253381700004,0.56388813,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,5,y,2,noloss
+510.274467467,0.03755645,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,5,b,1,noloss
+255.640871967,0.9890444,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,5,b,2,noloss
+349.69891581700006,0.59052,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,6,y,2,noloss
+607.327231467,0.045440182,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,6,b,1,noloss
+304.167253967,0.99782693,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,6,b,2,noloss
+385.21747281700004,0.6069058,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,7,y,2,noloss
+720.4112954670001,0.051495925,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,7,b,1,noloss
+360.709285967,1.0,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,7,b,2,noloss
+441.75950481700005,0.6169979,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,8,y,2,noloss
+867.446690467,0.05591029,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,8,b,1,noloss
+434.226983467,0.9994731,_MLAPPPIM[Oxidation (M)]K_,MLAPPPIM(ox)K,MLAPPPIMK,2,507.27974918115007,56.4251594543457,8,b,2,noloss
+88.063114317,0.30252227,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,1,y,2,noloss
+59.044501700333335,0.46046332,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,1,y,3,noloss
+66.52751896699999,0.47418872,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,1,b,2,noloss
+44.68743813366666,0.21134187,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,1,b,3,noloss
+161.58081181699998,0.4249473,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,2,y,2,noloss
+108.05630003366666,0.6548788,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,2,y,3,noloss
+144.578074467,0.718152,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,2,b,2,noloss
+96.72114180033333,0.17981398,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,2,b,3,noloss
+210.10719381699997,0.47798508,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,3,y,2,noloss
+140.40722136699998,0.7354044,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,3,y,3,noloss
+180.096631467,0.84923667,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,3,b,2,noloss
+120.40017980033333,0.07393339,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,3,b,3,noloss
+258.633575817,0.5043297,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,4,y,2,noloss
+172.75814270033334,0.76776385,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,4,y,3,noloss
+236.63866346700001,0.92138934,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,4,b,2,noloss
+307.159957817,0.51941454,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,5,y,2,noloss
+205.10906403366667,0.7802104,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,5,y,3,noloss
+293.180695467,0.9610952,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,5,b,2,noloss
+355.68633981700003,0.52809644,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,6,y,2,noloss
+237.45998536700003,0.78521365,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,6,y,3,noloss
+349.722727467,0.98227686,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,6,b,2,noloss
+412.22837181700004,0.5333996,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,7,y,2,noloss
+275.15467336700004,0.7872712,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,7,y,3,noloss
+406.26475946700003,0.99297327,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,7,b,2,noloss
+468.77040381700004,0.5366646,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,8,y,2,noloss
+312.84936136700003,0.78817904,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,8,y,3,noloss
+454.79114146700005,0.9978749,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,8,b,2,noloss
+525.312435817,0.53864604,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,9,y,2,noloss
+350.54404936700007,0.78867465,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,9,y,3,noloss
+503.31752346700006,0.9997002,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,9,b,2,noloss
+581.854467817,0.5397395,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,10,y,2,noloss
+388.23873736700006,0.7890641,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,10,y,3,noloss
+551.843905467,1.0,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,10,b,2,noloss
+617.373024817,0.54010165,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,11,y,2,noloss
+411.91777536700005,0.78956795,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,11,y,3,noloss
+600.370287467,0.9995948,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,11,b,2,noloss
+695.423580317,0.54046255,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,12,y,2,noloss
+463.95147903366666,0.78978145,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,12,y,3,noloss
+673.8879849670001,0.9990461,_MRALLLIPPPPM[Oxidation (M)]R_,MRALLLIPPPPM(ox)R,MRALLLIPPPPMR,6,254.31945917761504,56.41558837890625,12,b,2,noloss
diff --git a/examples/peptidelist.msms b/examples/peptidelist.msms
new file mode 100644
index 0000000..0473c20
--- /dev/null
+++ b/examples/peptidelist.msms
@@ -0,0 +1,4 @@
+Intensities	Masses	Matches	Modified Sequence	Charge
+0.55108744;0.7980077;0.9098875;0.96106666;0.9845231;0.99360347;0.9969484;0.99818367;0.99877864;0.17598717;0.17649946;0.13785253;0.101257205;0.075195946;0.058385145;0.0481462;0.04209969;0.038592704	175.118952167;322.15434716699997;435.238411167;548.322475167;619.359589167;690.3967031669999;761.4338171669999;858.4865811669999;989.5270661669999;44.68743813366666;88.36759980033332;120.71852113366667;144.39755913366665;168.07659713366664;191.75563513366663;229.45032313366664;267.1450111336666;316.15680946699996	y1;y2;y3;y4;y5;y6;y7;y8;y9;b1(3+);b2(3+);b3(3+);b4(3+);b5(3+);b6(3+);b7(3+);b8(3+);b9(3+)	MMPAAALIM(ox)R	3
+0.006076492;0.009780246;0.018071838;0.028130919;0.03755645;0.045440182;0.051495925;0.05591029;0.19450517;0.34726548;0.45179287;0.5201007;0.56388813;0.59052;0.6069058;0.6169979;0.5418572;0.79382604;0.911572;0.9655734;0.9890444;0.99782693;1.0;0.9994731	132.047761467;245.131825467;316.168939467;413.221703467;510.274467467;607.327231467;720.4112954670001;867.446690467;74.060040317;147.57773781699998;204.119769817;252.646151817;301.17253381700004;349.69891581700006;385.21747281700004;441.75950481700005;66.52751896699999;123.069550967;158.58810796699998;207.114489967;255.640871967;304.167253967;360.709285967;434.226983467	b1;b2;b3;b4;b5;b6;b7;b8;y1(2+);y2(2+);y3(2+);y4(2+);y5(2+);y6(2+);y7(2+);y8(2+);b1(2+);b2(2+);b3(2+);b4(2+);b5(2+);b6(2+);b7(2+);b8(2+)	MLAPPPIM(ox)K	2
+0.30252227;0.4249473;0.47798508;0.5043297;0.51941454;0.52809644;0.5333996;0.5366646;0.53864604;0.5397395;0.54010165;0.54046255;0.47418872;0.718152;0.84923667;0.92138934;0.9610952;0.98227686;0.99297327;0.9978749;0.9997002;1.0;0.9995948;0.9990461;0.46046332;0.6548788;0.7354044;0.76776385;0.7802104;0.78521365;0.7872712;0.78817904;0.78867465;0.7890641;0.78956795;0.78978145;0.21134187;0.17981398;0.07393339	88.063114317;161.58081181699998;210.10719381699997;258.633575817;307.159957817;355.68633981700003;412.22837181700004;468.77040381700004;525.312435817;581.854467817;617.373024817;695.423580317;66.52751896699999;144.578074467;180.096631467;236.63866346700001;293.180695467;349.722727467;406.26475946700003;454.79114146700005;503.31752346700006;551.843905467;600.370287467;673.8879849670001;59.044501700333335;108.05630003366666;140.40722136699998;172.75814270033334;205.10906403366667;237.45998536700003;275.15467336700004;312.84936136700003;350.54404936700007;388.23873736700006;411.91777536700005;463.95147903366666;44.68743813366666;96.72114180033333;120.40017980033333	y1(2+);y2(2+);y3(2+);y4(2+);y5(2+);y6(2+);y7(2+);y8(2+);y9(2+);y10(2+);y11(2+);y12(2+);b1(2+);b2(2+);b3(2+);b4(2+);b5(2+);b6(2+);b7(2+);b8(2+);b9(2+);b10(2+);b11(2+);b12(2+);y1(3+);y2(3+);y3(3+);y4(3+);y5(3+);y6(3+);y7(3+);y8(3+);y9(3+);y10(3+);y11(3+);y12(3+);b1(3+);b2(3+);b3(3+)	MRALLLIPPPPM(ox)R	6
diff --git a/examples/peptidelist.msp b/examples/peptidelist.msp
new file mode 100644
index 0000000..ba8bf56
--- /dev/null
+++ b/examples/peptidelist.msp
@@ -0,0 +1,93 @@
+Name: MMPAAALIMR/3
+MW: 374.19403587446
+Comment: Parent=374.19403587446 Collision_energy=35.0 Mods=1/8,M,Oxidation ModString=MMPAAALIMR//Oxidation@M9/3
+Num peaks: 18
+175.118952167	0.55108744	"y1/0.0ppm"
+44.68743813366666	0.17598717	"b1^3)/0.0ppm"
+322.15434716699997	0.7980077	"y2/0.0ppm"
+88.36759980033332	0.17649946	"b2^3)/0.0ppm"
+435.238411167	0.9098875	"y3/0.0ppm"
+120.71852113366667	0.13785253	"b3^3)/0.0ppm"
+548.322475167	0.96106666	"y4/0.0ppm"
+144.39755913366665	0.101257205	"b4^3)/0.0ppm"
+619.359589167	0.9845231	"y5/0.0ppm"
+168.07659713366664	0.075195946	"b5^3)/0.0ppm"
+690.3967031669999	0.99360347	"y6/0.0ppm"
+191.75563513366663	0.058385145	"b6^3)/0.0ppm"
+761.4338171669999	0.9969484	"y7/0.0ppm"
+229.45032313366664	0.0481462	"b7^3)/0.0ppm"
+858.4865811669999	0.99818367	"y8/0.0ppm"
+267.1450111336666	0.04209969	"b8^3)/0.0ppm"
+989.5270661669999	0.99877864	"y9/0.0ppm"
+316.15680946699996	0.038592704	"b9^3)/0.0ppm"
+Name: MLAPPPIMK/2
+MW: 507.27974918115007
+Comment: Parent=507.27974918115007 Collision_energy=30.0 Mods=1/7,M,Oxidation ModString=MLAPPPIMK//Oxidation@M8/2
+Num peaks: 24
+74.060040317	0.19450517	"y1^2)/0.0ppm"
+132.047761467	0.006076492	"b1/0.0ppm"
+66.52751896699999	0.5418572	"b1^2)/0.0ppm"
+147.57773781699998	0.34726548	"y2^2)/0.0ppm"
+245.131825467	0.009780246	"b2/0.0ppm"
+123.069550967	0.79382604	"b2^2)/0.0ppm"
+204.119769817	0.45179287	"y3^2)/0.0ppm"
+316.168939467	0.018071838	"b3/0.0ppm"
+158.58810796699998	0.911572	"b3^2)/0.0ppm"
+252.646151817	0.5201007	"y4^2)/0.0ppm"
+413.221703467	0.028130919	"b4/0.0ppm"
+207.114489967	0.9655734	"b4^2)/0.0ppm"
+301.17253381700004	0.56388813	"y5^2)/0.0ppm"
+510.274467467	0.03755645	"b5/0.0ppm"
+255.640871967	0.9890444	"b5^2)/0.0ppm"
+349.69891581700006	0.59052	"y6^2)/0.0ppm"
+607.327231467	0.045440182	"b6/0.0ppm"
+304.167253967	0.99782693	"b6^2)/0.0ppm"
+385.21747281700004	0.6069058	"y7^2)/0.0ppm"
+720.4112954670001	0.051495925	"b7/0.0ppm"
+360.709285967	1.0	"b7^2)/0.0ppm"
+441.75950481700005	0.6169979	"y8^2)/0.0ppm"
+867.446690467	0.05591029	"b8/0.0ppm"
+434.226983467	0.9994731	"b8^2)/0.0ppm"
+Name: MRALLLIPPPPMR/6
+MW: 254.31945917761504
+Comment: Parent=254.31945917761504 Collision_energy=30.0 Mods=1/11,M,Oxidation ModString=MRALLLIPPPPMR//Oxidation@M12/6
+Num peaks: 39
+88.063114317	0.30252227	"y1^2)/0.0ppm"
+59.044501700333335	0.46046332	"y1^3)/0.0ppm"
+66.52751896699999	0.47418872	"b1^2)/0.0ppm"
+44.68743813366666	0.21134187	"b1^3)/0.0ppm"
+161.58081181699998	0.4249473	"y2^2)/0.0ppm"
+108.05630003366666	0.6548788	"y2^3)/0.0ppm"
+144.578074467	0.718152	"b2^2)/0.0ppm"
+96.72114180033333	0.17981398	"b2^3)/0.0ppm"
+210.10719381699997	0.47798508	"y3^2)/0.0ppm"
+140.40722136699998	0.7354044	"y3^3)/0.0ppm"
+180.096631467	0.84923667	"b3^2)/0.0ppm"
+120.40017980033333	0.07393339	"b3^3)/0.0ppm"
+258.633575817	0.5043297	"y4^2)/0.0ppm"
+172.75814270033334	0.76776385	"y4^3)/0.0ppm"
+236.63866346700001	0.92138934	"b4^2)/0.0ppm"
+307.159957817	0.51941454	"y5^2)/0.0ppm"
+205.10906403366667	0.7802104	"y5^3)/0.0ppm"
+293.180695467	0.9610952	"b5^2)/0.0ppm"
+355.68633981700003	0.52809644	"y6^2)/0.0ppm"
+237.45998536700003	0.78521365	"y6^3)/0.0ppm"
+349.722727467	0.98227686	"b6^2)/0.0ppm"
+412.22837181700004	0.5333996	"y7^2)/0.0ppm"
+275.15467336700004	0.7872712	"y7^3)/0.0ppm"
+406.26475946700003	0.99297327	"b7^2)/0.0ppm"
+468.77040381700004	0.5366646	"y8^2)/0.0ppm"
+312.84936136700003	0.78817904	"y8^3)/0.0ppm"
+454.79114146700005	0.9978749	"b8^2)/0.0ppm"
+525.312435817	0.53864604	"y9^2)/0.0ppm"
+350.54404936700007	0.78867465	"y9^3)/0.0ppm"
+503.31752346700006	0.9997002	"b9^2)/0.0ppm"
+581.854467817	0.5397395	"y10^2/0.0ppm"
+388.23873736700006	0.7890641	"y10^3/0.0ppm"
+551.843905467	1.0	"b10^2/0.0ppm"
+617.373024817	0.54010165	"y11^2/0.0ppm"
+411.91777536700005	0.78956795	"y11^3/0.0ppm"
+600.370287467	0.9995948	"b11^2/0.0ppm"
+695.423580317	0.54046255	"y12^2/0.0ppm"
+463.95147903366666	0.78978145	"y12^3/0.0ppm"
+673.8879849670001	0.9990461	"b12^2/0.0ppm"
\ No newline at end of file

From e3692c5ec266a229d388701ebf7cf9d627f71ae2 Mon Sep 17 00:00:00 2001
From: Siegfried Gessulat <s.gessulat@gmail.com>
Date: Tue, 2 Jul 2019 14:25:41 +0200
Subject: [PATCH 15/18] :lipstick: :books: fix typos

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 4dc40a3..5620121 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ Please find an example input file at `example/peptidelist.csv`. After starting t
 
     curl -F "peptides=@examples/peptidelist.csv" http://127.0.0.1:5000/predict/msms
 
-    The examples takes about 4s to run. Expected output files (.generic, .msp and .msms) can be found in `examples/`.
+The examples take about 4s to run. Expected output files (.generic, .msp and .msms) can be found in `examples/`.
 
 ## Using Prosit on your data
 

From 4a07e7c885657ea582927876b5961015c273bd9a Mon Sep 17 00:00:00 2001
From: Tobias Schmidt <tobias.k.schmidt@tum.de>
Date: Tue, 23 Jul 2019 06:57:20 +0000
Subject: [PATCH 16/18] trained set to true for prediction

---
 prosit/server.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/prosit/server.py b/prosit/server.py
index a4480ca..93dba77 100644
--- a/prosit/server.py
+++ b/prosit/server.py
@@ -88,13 +88,15 @@ def cleanup(response):
         d_spectra["session"] = tf.Session()
         with d_spectra["session"].as_default():
             d_spectra["model"], d_spectra["config"] = model.load(
-                constants.MODEL_SPECTRA
+                constants.MODEL_SPECTRA,
+                trained=True
             )
             d_spectra["model"].compile(optimizer="adam", loss="mse")
     d_irt["graph"] = tf.Graph()
     with d_irt["graph"].as_default():
         d_irt["session"] = tf.Session()
         with d_irt["session"].as_default():
-            d_irt["model"], d_irt["config"] = model.load(constants.MODEL_IRT)
+            d_irt["model"], d_irt["config"] = model.load(constants.MODEL_IRT,
+                    trained=True)
             d_irt["model"].compile(optimizer="adam", loss="mse")
     app.run(host="0.0.0.0")

From 54135c35615dab852ef22dc26f2d3f03b149f2fb Mon Sep 17 00:00:00 2001
From: Michelle Gill <michelle@michellelynngill.com>
Date: Fri, 15 Nov 2019 12:57:32 -0500
Subject: [PATCH 17/18] io to io_local to fix collision w Python3 lib

---
 prosit/__init__.py            | 2 +-
 prosit/{io.py => io_local.py} | 0
 prosit/prediction.py          | 4 ++--
 prosit/server.py              | 2 +-
 prosit/training.py            | 8 ++++----
 5 files changed, 8 insertions(+), 8 deletions(-)
 rename prosit/{io.py => io_local.py} (100%)

diff --git a/prosit/__init__.py b/prosit/__init__.py
index d0e5b19..a642869 100644
--- a/prosit/__init__.py
+++ b/prosit/__init__.py
@@ -1,4 +1,4 @@
-from . import io
+from . import io_local
 from . import constants
 from . import model
 from . import alignment
diff --git a/prosit/io.py b/prosit/io_local.py
similarity index 100%
rename from prosit/io.py
rename to prosit/io_local.py
diff --git a/prosit/prediction.py b/prosit/prediction.py
index 10360d4..45821f7 100644
--- a/prosit/prediction.py
+++ b/prosit/prediction.py
@@ -3,14 +3,14 @@
 import numpy as np
 
 from . import model as model_lib
-from . import io
+from . import io_local
 from . import constants
 from . import sanitize
 
 
 def predict(data, d_model):
     # check for mandatory keys
-    x = io.get_array(data, d_model["config"]["x"])
+    x = io_local.get_array(data, d_model["config"]["x"])
 
     keras.backend.set_session(d_model["session"])
     with d_model["graph"].as_default():
diff --git a/prosit/server.py b/prosit/server.py
index 93dba77..c5c12a3 100644
--- a/prosit/server.py
+++ b/prosit/server.py
@@ -7,7 +7,7 @@
 import tensorflow as tf
 
 from . import model
-from . import io
+from . import io_local
 from . import constants
 from . import tensorize
 from . import prediction
diff --git a/prosit/training.py b/prosit/training.py
index c70d55c..100d84b 100644
--- a/prosit/training.py
+++ b/prosit/training.py
@@ -1,6 +1,6 @@
 import os
 
-from . import io
+from . import io_local
 from . import losses
 from . import model as model_lib
 from . import constants
@@ -28,8 +28,8 @@ def train(tensor, model, model_config, callbacks):
     else:
         loss = losses.get(model_config["loss"])
     optimizer = model_config["optimizer"]
-    x = io.get_array(tensor, model_config["x"])
-    y = io.get_array(tensor, model_config["y"])
+    x = io_local.get_array(tensor, model_config["x"])
+    y = io_local.get_array(tensor, model_config["y"])
     model.compile(optimizer=optimizer, loss=loss)
     model.fit(
         x=x,
@@ -48,6 +48,6 @@ def train(tensor, model, model_config, callbacks):
     model_dir = constants.MODEL_DIR
 
     model, model_config = model_lib.load(model_dir, trained=True)
-    tensor = io.from_hdf5(data_path)
+    tensor = io_local.from_hdf5(data_path)
     callbacks = get_callbacks(model_dir)
     train(tensor, model, model_config, callbacks)

From f69cc63b8319952bb75a0f02889a7d3fa439b52d Mon Sep 17 00:00:00 2001
From: Michelle Gill <michelle@michellelynngill.com>
Date: Fri, 15 Nov 2019 12:58:22 -0500
Subject: [PATCH 18/18] Use memory mapped arrays for data loading

---
 prosit/io_local.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/prosit/io_local.py b/prosit/io_local.py
index a28977d..131cb52 100644
--- a/prosit/io_local.py
+++ b/prosit/io_local.py
@@ -14,9 +14,16 @@ def to_hdf5(dictionary, path):
             f.create_dataset(key, data=data, dtype=data.dtype, compression="gzip")
 
 
-def from_hdf5(path):
+def from_hdf5(path, n_samples=None):
+    from keras.utils import HDF5Matrix
     import h5py
-
-    with h5py.File(path, "r") as f:
-        data = {k: f[k][...] for k in f.keys()}
+    
+    # Get a list of the keys for the datasets
+    with h5py.File(path, 'r') as f:
+        dataset_list = list(f.keys())
+    
+    # Assemble into a dictionary
+    data = dict()
+    for dataset in dataset_list:
+        data[dataset] = HDF5Matrix(path, dataset, start=0, end=n_samples, normalizer=None)
     return data