-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathprocess_peptide_binding.py
57 lines (48 loc) · 1.34 KB
/
process_peptide_binding.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
Run `grep` on the appropriate lines of
mhcflurry/4/1.2.0/data_curated/curated_training_data.no_mass_spec.csv
to create a csv file such as "../data/HLA-A_01:01.csv".
This script then writes a new file, "../data/HLA-A_01:01_processed.csv".
Pass that processed file to `read_data.py`.
"""
import os
import sys
import csv
import numpy as np
import pandas as pd
from extractor import Extractor
from Bio import SeqIO
# Input csv to process; swap in one of the commented alternatives below to
# process a different file.
file_name = "../data/HLA-A_01:01.csv"
#file_name = "../data/HLA-B_08:02.csv"
#file_name = "../data/HLA-B_44:02.csv"

feat_maker = Extractor()

# Xs collects one feature vector per kept peptide, ys the matching value.
Xs = []
ys = []
with open(file_name, "r") as f:
    csv_reader = csv.reader(f)
    # The first row is read only so that it is skipped by the loop below.
    header = next(csv_reader)
    for line in csv_reader:
        # Columns used: 1 = peptide string, 2 = numeric value,
        # 3 = inequality marker.
        peptide = line[1]
        val = float(line[2])
        ineq = line[3]
        # Keep only rows whose value is marked as an exact measurement.
        if ineq != "=":
            continue
        # Keep only peptides of length 9.
        if len(peptide) != 9:
            continue
        # NOTE(review): assumes extract() returns a fixed-length 1-D feature
        # vector, as required by the column-wise concatenate below -- confirm
        # against extractor.Extractor.
        Xs.append(feat_maker.extract(peptide))
        ys.append(val)

# Fail with a clear message instead of an opaque zero-size-array error from
# the statistics below.
if not Xs:
    raise ValueError("no usable rows found in %s" % file_name)

Xs = np.array(Xs)
# Column vector of natural-log-transformed values, so it can be appended to
# Xs as the last column.
ys = np.log(np.array(ys).reshape(-1, 1))

# Summary statistics of the log-transformed targets.
print(ys.max())
print(ys.min())
print(np.median(ys))
print(np.mean(ys))
print(np.var(ys))

out_data = np.concatenate([Xs, ys], axis=1)
print(out_data.shape)

# One header name per column: x0..x{d-1} for the features plus "y" for the
# target. The count must come from the number of columns (shape[1]), not the
# number of rows (shape[0]).
column_names = ["x%d" % i for i in range(out_data.shape[1] - 1)] + ["y"]
np.savetxt(
    file_name.replace(".csv", "_processed.csv"), out_data, delimiter=",",
    header=",".join(column_names))