From 0d16926aa5f897cc3ad13452c7b942d8d83b1275 Mon Sep 17 00:00:00 2001 From: fcyu Date: Mon, 2 Nov 2015 10:29:26 +0800 Subject: [PATCH] Create --- .gitattributes | 17 + .gitignore | 18 + README.md | 14 + add_jramp | 1 + pom.xml | 62 +++ .../java/proteomics/Index/BuildIndex.java | 252 ++++++++++ src/main/java/proteomics/Index/BuildMap.java | 92 ++++ .../java/proteomics/Index/ChainEntry.java | 22 + src/main/java/proteomics/LogEntry.java | 9 + src/main/java/proteomics/Math/CalScore.java | 43 ++ .../java/proteomics/Parameter/Parameter.java | 86 ++++ .../proteomics/Search/FinalResultEntry.java | 42 ++ .../java/proteomics/Search/PrepareSearch.java | 73 +++ .../java/proteomics/Search/ResultEntry.java | 16 + src/main/java/proteomics/Search/Search.java | 438 ++++++++++++++++++ src/main/java/proteomics/Search/SemiPSM.java | 19 + .../java/proteomics/Search/SpectrumEntry.java | 19 + src/main/java/proteomics/SearchMain.java | 143 ++++++ .../java/proteomics/Spectrum/PreSpectrum.java | 181 ++++++++ .../java/proteomics/Validation/CalFDR.java | 84 ++++ .../Validation/GenerateDecoySeq.java | 97 ++++ src/main/java/theoSeq/DbTool.java | 61 +++ src/main/java/theoSeq/MassTool.java | 243 ++++++++++ src/main/resources/parameter.def | 49 ++ 24 files changed, 2081 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 README.md create mode 100644 add_jramp create mode 100644 pom.xml create mode 100644 src/main/java/proteomics/Index/BuildIndex.java create mode 100644 src/main/java/proteomics/Index/BuildMap.java create mode 100644 src/main/java/proteomics/Index/ChainEntry.java create mode 100644 src/main/java/proteomics/LogEntry.java create mode 100644 src/main/java/proteomics/Math/CalScore.java create mode 100644 src/main/java/proteomics/Parameter/Parameter.java create mode 100644 src/main/java/proteomics/Search/FinalResultEntry.java create mode 100644 src/main/java/proteomics/Search/PrepareSearch.java create mode 100644 src/main/java/proteomics/Search/ResultEntry.java create mode 100644 src/main/java/proteomics/Search/Search.java create mode 100644 src/main/java/proteomics/Search/SemiPSM.java create mode 100644 src/main/java/proteomics/Search/SpectrumEntry.java create mode 100644 src/main/java/proteomics/SearchMain.java create mode 100644 src/main/java/proteomics/Spectrum/PreSpectrum.java create mode 100644 src/main/java/proteomics/Validation/CalFDR.java create mode 100644 src/main/java/proteomics/Validation/GenerateDecoySeq.java create mode 100644 src/main/java/theoSeq/DbTool.java create mode 100644 src/main/java/theoSeq/MassTool.java create mode 100644 src/main/resources/parameter.def diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..bdb0cab --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +# Auto detect text files and perform LF normalization +* text=auto + +# Custom for Visual Studio +*.cs diff=csharp + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..8ab0a37 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +*.class + + +# Package Files # +*.jar +*.war +*.ear + +# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml +hs_err_pid* + +*.class +nbactions.xml + +target + +.idea +*.iml \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..588374e --- /dev/null +++ b/README.md @@ -0,0 +1,14 @@ +# ExhaustiveCL +A fast and exhaustive cross-linked peptides identification tool. + +## Usage + Usage: java -Xmx32g -jar /path/to/ExhaustiveCL.jar + : parameter file. Can be download along with ExhaustiveCL. + : spectra data file (mzXML) + : result files' directory + example: java -Xmx32g -jar ExhaustiveCL.jar parameter.def data.mzxml result_dir/ + +## Demo data + + +## Cite diff --git a/add_jramp b/add_jramp new file mode 100644 index 0000000..2cdd994 --- /dev/null +++ b/add_jramp @@ -0,0 +1 @@ +mvn install:install-file -Dfile=D:\Dropbox\proteomics\API\jramp\stax\software\jrap_StAX_v5.2.jar -DgroupId=jramp -DartifactId=jramp -Dversion=5.2 -Dpackaging=jar -DgeneratePom=true \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..4956b4f --- /dev/null +++ b/pom.xml @@ -0,0 +1,62 @@ + + 4.0.0 + + proteomics + ECL + 20151029 + jar + + ECL + http://maven.apache.org + + + UTF-8 + 1.7 + 1.7 + + + + + org.xerial + sqlite-jdbc + 3.8.9.1 + + + jramp + jramp + 5.2 + + + + + + + maven-assembly-plugin + 2.4 + + + jar-with-dependencies + + + + proteomics.SearchMain + + + + false + + + + + make-assembly + package + + single + + + + + + + diff --git a/src/main/java/proteomics/Index/BuildIndex.java b/src/main/java/proteomics/Index/BuildIndex.java new file mode 100644 index 0000000..ec778e5 --- /dev/null +++ b/src/main/java/proteomics/Index/BuildIndex.java @@ -0,0 +1,252 @@ +package proteomics.Index; + +import java.util.*; +import java.sql.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import theoSeq.*; +import proteomics.Validation.*; + +public class BuildIndex { + + private static final String TARGET = "1"; + private static final String DECOY = "0"; + + private double hatom_mass = 0; + private double oatom_mass = 0; + private float max_precursor_mass = 0; + private int min_chain_length = 0; + private int max_chain_length = 0; + private String db_path = ""; + private MassTool mass_tool_obj = null; + private Map pro_seq_map = null; + final private Map seq_mass_map = new HashMap<>(); + final private Map> seq_pro_map = new HashMap<>(); + final private Set for_check_duplitate = new HashSet<>(); + final private Map seq_term_type_map = new HashMap<>(); + final private Map decoy_seq_mass_map = new HashMap<>(); + final private Map> decoy_seq_pro_map = new HashMap<>(); + final private Map decoy_seq_term_type_map = new HashMap<>(); + private double nterm_mass = 0; + + /////////////////////////////////public methods////////////////////////////////////////////////////////////////// + public BuildIndex(Map parameter_map, Map fix_mod_map) throws Exception { + // initialize parameters + max_precursor_mass = Float.valueOf(parameter_map.get("max_precursor_mass")); + min_chain_length = Integer.valueOf(parameter_map.get("min_chain_length")); + max_chain_length = Integer.valueOf(parameter_map.get("max_chain_length")); + String decoy_db = parameter_map.get("decoy_db"); + db_path = parameter_map.get("db"); + int missed_cleavage = Integer.valueOf(parameter_map.get("missed_cleavage")); + String cl_aa = parameter_map.get("cl_aa"); + int nterm_linkable = Integer.valueOf(parameter_map.get("nterm_linkable")); + + // read protein database + DbTool db_tool_obj = new DbTool(db_path); + pro_seq_map = db_tool_obj.returnSeqMap(); + + // define a new MassTool object + mass_tool_obj = new MassTool(missed_cleavage, fix_mod_map, nterm_linkable); + Map mass_table = mass_tool_obj.returnMassTable(); + hatom_mass = mass_table.get("Hatom"); + oatom_mass = mass_table.get("Oatom"); + nterm_mass = mass_table.get("Z"); + } + + /////////////////////////////////////public methods//////////////////////////////////////////////////////////////////// + public void buildCLDb(Map parameter_map) throws Exception { + buildPepChainMap(); + buildDecoyPepChainMap(); + + // Create index SQL database + Connection index_conn = null; + Statement index_statement = null; + try { + Class.forName("org.sqlite.JDBC").newInstance(); + index_conn = DriverManager.getConnection("jdbc:sqlite:" + ":memory:"); + index_statement = index_conn.createStatement(); + index_statement.executeUpdate("create table peptide_chain (chain_index integer primary key asc, mass real not null, peptide_sequence text not null, protein_id text not null, type text not null, term_type integer not null);"); + index_statement.executeUpdate("create index mass on peptide_chain (mass);"); + index_statement.executeUpdate("create index seq on peptide_chain (peptide_sequence);"); + } catch (SQLException ex) { + System.err.println("SQLException: " + ex.getMessage()); + System.exit(1); + } + + // Build peptide_chain table + // It contains target and decoy sequence + PreparedStatement index_prepared_statement = index_conn.prepareStatement("insert into peptide_chain (mass, peptide_sequence, protein_id, type, term_type) values (?, ?, ?, ?, ?);"); + try { + index_conn.setAutoCommit(false); + Set seq_set = seq_pro_map.keySet(); + for (String seq : seq_set) { + if ((seq.length() < min_chain_length) || (seq.length() > max_chain_length)) { + continue; + } + + index_prepared_statement.setFloat(1, seq_mass_map.get(seq)); + index_prepared_statement.setString(2, seq); + Set pro_id_set = seq_pro_map.get(seq); + String temp = pro_id_set.toString().replace(", ", "&"); + String temp_2 = temp.substring(1, temp.length() - 1); + index_prepared_statement.setString(3, temp_2); + index_prepared_statement.setString(4, TARGET); + index_prepared_statement.setInt(5, seq_term_type_map.get(seq)); + index_prepared_statement.executeUpdate(); + } + index_conn.commit(); + } catch (SQLException ex) { + System.err.println("SQLException: " + ex.getMessage()); + System.exit(1); + } finally { + index_prepared_statement.close(); + index_conn.setAutoCommit(true); + } + + index_prepared_statement = index_conn.prepareStatement("insert into peptide_chain (mass, peptide_sequence, protein_id, type, term_type) values (?, ?, ?, ?, ?);"); + try { + index_conn.setAutoCommit(false); + Set seq_set = decoy_seq_pro_map.keySet(); + for (String seq : seq_set) { + if ((seq.length() < min_chain_length) || (seq.length() > max_chain_length)) { + continue; + } + + index_prepared_statement.setFloat(1, decoy_seq_mass_map.get(seq)); + index_prepared_statement.setString(2, seq); + Set pro_id_set = decoy_seq_pro_map.get(seq); + String temp = pro_id_set.toString().replace(", ", "&"); + String temp_2 = temp.substring(1, temp.length() - 1); + index_prepared_statement.setString(3, temp_2); + index_prepared_statement.setString(4, DECOY); + index_prepared_statement.setInt(5, decoy_seq_term_type_map.get(seq)); + index_prepared_statement.executeUpdate(); + } + index_conn.commit(); + } catch (SQLException ex) { + System.err.println("SQLException: " + ex.getMessage()); + System.exit(1); + } finally { + index_prepared_statement.close(); + index_conn.setAutoCommit(true); + } + + // Backup SQL database to disk + try { + index_statement.executeUpdate("backup to " + db_path + ".db"); + } catch (SQLException ex) { + System.err.println("SQLException: " + ex.getMessage()); + System.exit(1); + } + + index_statement.close(); + index_conn.close(); + } + + public MassTool returnMassToolObj() { + return mass_tool_obj; + } + + //////////////////////////////////////////private methods//////////////////////////////////////////////////////// + private void buildPepChainMap() { + Set pro_id_set = pro_seq_map.keySet(); + for (String pro_id : pro_id_set) { + String pro_seq = pro_seq_map.get(pro_id); + Set seq_set = mass_tool_obj.buildChainSet(pro_seq); + for (String seq : seq_set) { + float mass_temp = (float) mass_tool_obj.calResidueMass(seq) + (float) nterm_mass + 2 * (float) hatom_mass + (float) oatom_mass; // calMass just calculate the residue mass, so we should add a H2O + if (mass_temp <= max_precursor_mass) { + seq_mass_map.put(seq, mass_temp); + + // Add the sequence to the check set for decoy duplicate check + String template_seq = seq.replace("L", "I"); // "L" and "I" have the same mass. + template_seq = template_seq.replace("K", "Q"); // "K" and "Q" have the close mass. + for_check_duplitate.add(template_seq); + + if (pro_seq.startsWith(seq)) { + seq_term_type_map.put(seq, 1); + } else if (pro_seq.endsWith(seq)) { + seq_term_type_map.put(seq, 2); + } else { + seq_term_type_map.put(seq, 0); + } + + if (seq_pro_map.containsKey(seq)) { + Set pro_list = seq_pro_map.get(seq); + pro_list.add(pro_id); + seq_pro_map.put(seq, pro_list); + } else { + Set pro_list = new HashSet<>(); + pro_list.add(pro_id); + seq_pro_map.put(seq, pro_list); + } + } + } + } + } + + private void buildDecoyPepChainMap() throws Exception { + Set seq_set = seq_pro_map.keySet(); + for (String original_seq : seq_set) { + String decoy_seq = ""; + if (original_seq.endsWith("K") || original_seq.endsWith("R")) { + String temp = reverseSeq(original_seq.substring(0, original_seq.length() - 1)); + decoy_seq = temp + original_seq.charAt(original_seq.length() - 1); + } else { + decoy_seq = reverseSeq(original_seq); + } + + // Check duplicate + String new_decoy_seq = decoy_seq.replace("L", "I"); + new_decoy_seq = new_decoy_seq.replace("K", "Q"); + if (for_check_duplitate.contains(new_decoy_seq)) { + // the decoy sequence is the same as the target sequence + continue; + } + + float decoy_mass_temp = seq_mass_map.get(original_seq); + decoy_seq_mass_map.put(decoy_seq, decoy_mass_temp); + String pro_id = seq_pro_map.get(original_seq).iterator().next(); + String decoy_pro_id = "DECOY_" + pro_id; + String pro_seq = pro_seq_map.get(pro_id); + if (pro_seq.startsWith(original_seq)) { + decoy_seq_term_type_map.put(decoy_seq, 1); + } else if (pro_seq.endsWith(original_seq)) { + decoy_seq_term_type_map.put(decoy_seq, 2); + } else { + decoy_seq_term_type_map.put(decoy_seq, 0); + } + + Set decoy_pro_set = new HashSet<>(); + decoy_pro_set.add(decoy_pro_id); + decoy_seq_pro_map.put(decoy_seq, decoy_pro_set); + } + } + + private String reverseSeq(String sequence) { + Pattern fix_pattern = Pattern.compile("[K]"); + String decoy_str = ""; + Matcher fix_matcher = fix_pattern.matcher(sequence); + int sequence_length = sequence.length(); + int idx_1 = 0; + int idx_2; + while (idx_1 < sequence_length) { + String fix_aa; + if (fix_matcher.find()) { + idx_2 = fix_matcher.start(); + fix_aa = sequence.substring(idx_2, idx_2 + 1); + } else { + idx_2 = sequence_length; + fix_aa = ""; + } + String part = sequence.substring(idx_1, idx_2); + + // Reverse part sequence + decoy_str += new StringBuilder(part).reverse().toString() + fix_aa; + idx_1 = idx_2 + 1; + } + + return decoy_str; + } +} diff --git a/src/main/java/proteomics/Index/BuildMap.java b/src/main/java/proteomics/Index/BuildMap.java new file mode 100644 index 0000000..f08fbc5 --- /dev/null +++ b/src/main/java/proteomics/Index/BuildMap.java @@ -0,0 +1,92 @@ +package proteomics.Index; + +import theoSeq.MassTool; + +import java.sql.*; +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class BuildMap { + + private Map chain_entry_map = new HashMap<>(); + private Map index_chain_map = new HashMap<>(); + + public BuildMap(Map parameter_map, Map fix_mod_map) throws Exception { + int nterm_linkable = Integer.valueOf(parameter_map.get("nterm_linkable")); + MassTool mass_tool_obj = new MassTool(Integer.valueOf(parameter_map.get("missed_cleavage")), fix_mod_map, nterm_linkable); + int[] temp = string2Array(parameter_map.get("common_ion_charge")); + int min_charge = temp[0]; + temp = string2Array(parameter_map.get("xlink_ion_charge")); + int max_charge = temp[temp.length - 1]; + + Pattern link_site_pattern = Pattern.compile(parameter_map.get("cl_aa") + "(?!$)"); + + // Create a index SQL connection and read the database from disk. + Connection index_conn = null; + Statement index_statement = null; + ResultSet index_rs = null; + try { + Class.forName("org.sqlite.JDBC").newInstance(); + index_conn = DriverManager.getConnection("jdbc:sqlite::memory:"); + index_statement = index_conn.createStatement(); + index_statement.executeUpdate("restore from " + parameter_map.get("db") + ".db"); + } catch (SQLException ex) { + System.err.println(ex.getMessage()); + System.exit(1); + } + + // Extract chain sequence and mass from peptide_chain table for further use + index_rs = index_statement.executeQuery("select chain_index, mass, peptide_sequence, protein_id, type, term_type from peptide_chain;"); + while (index_rs.next()) { + String chain_index = String.valueOf(index_rs.getInt(1)) + "-"; //TODO: Left for vairable modification + float chain_mass = index_rs.getFloat(2); + String chain_seq = index_rs.getString(3); + String protein_id = index_rs.getString(4); + String chain_type = index_rs.getString(5); + int term_type = index_rs.getInt(6); + double[][] chain_ion_array = mass_tool_obj.buildChainIonArray(chain_seq, min_charge, max_charge); + + index_chain_map.put(chain_index, chain_seq); + + // Generate a chain link site map for further use. + Matcher link_site_matcher = link_site_pattern.matcher(chain_seq); + List link_site_list = new LinkedList<>(); + while (link_site_matcher.find()) { + int link_site = link_site_matcher.start(); + link_site_list.add(link_site); + } + + if ((term_type == 1) && !link_site_list.contains(0) && (nterm_linkable == 1)) { + link_site_list.add(0); + } + + ChainEntry chain_entry = new ChainEntry(chain_mass, chain_index, protein_id, chain_type, link_site_list, chain_ion_array); + chain_entry_map.put(chain_seq, chain_entry); + } + + index_statement.close(); + index_conn.close(); + } + + public Map returnChainEntryMap() { + return chain_entry_map; + } + + public Map returnIndexChainMap() { + return index_chain_map; + } + + private int[] string2Array(String str) { + String[] temp = str.split(","); + int[] output = new int[temp.length]; + for (int i = 0; i < output.length; ++i) { + output[i] = Integer.valueOf(temp[i]); + } + + return output; + } +} diff --git a/src/main/java/proteomics/Index/ChainEntry.java b/src/main/java/proteomics/Index/ChainEntry.java new file mode 100644 index 0000000..9b4923a --- /dev/null +++ b/src/main/java/proteomics/Index/ChainEntry.java @@ -0,0 +1,22 @@ +package proteomics.Index; + +import java.util.*; + +public class ChainEntry { + + public float chain_mass = 0; + public String chain_index = ""; + public String pro_id = null; + public String chain_type = null; + public List linksite_list = null; + public double[][] chain_ion_array = null; + + public ChainEntry(float chain_mass, String chain_index, String pro_id, String chain_type, List linksite_list, double[][] chain_ion_array) { + this.chain_mass = chain_mass; + this.chain_index = chain_index; + this.pro_id = pro_id; + this.chain_type = chain_type; + this.linksite_list = linksite_list; + this.chain_ion_array = chain_ion_array; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/LogEntry.java b/src/main/java/proteomics/LogEntry.java new file mode 100644 index 0000000..0ed7f69 --- /dev/null +++ b/src/main/java/proteomics/LogEntry.java @@ -0,0 +1,9 @@ +package proteomics; + +public class LogEntry { + public String output_str = null; + + public LogEntry(String output_str) { + this.output_str = output_str; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/Math/CalScore.java b/src/main/java/proteomics/Math/CalScore.java new file mode 100644 index 0000000..a7f49a2 --- /dev/null +++ b/src/main/java/proteomics/Math/CalScore.java @@ -0,0 +1,43 @@ +package proteomics.Math; + + +public class CalScore { + + private double[][] alignment_matrix = null; + + public CalScore(double[][] exp_matrix, double[] theo_vector, double xcorr_tolerance) { + alignment_matrix = new double[2][exp_matrix[0].length + theo_vector.length]; + + int start = 0; + int idx = 0; + for (double theo_mz : theo_vector) { + if (exp_matrix[0][0] - theo_mz > xcorr_tolerance) { + continue; + } + + int j = start; + while (j < exp_matrix[0].length) { + double exp_mz = exp_matrix[0][j]; + double temp = exp_mz - theo_mz; + if ((temp >= -xcorr_tolerance) && (temp <= xcorr_tolerance)) { + alignment_matrix[0][idx] = exp_mz; + alignment_matrix[1][idx] = exp_matrix[1][j]; + ++idx; + } else if (temp > xcorr_tolerance) { + start = j - 1; + break; + } + ++j; + } + } + } + + public double cal_dot_product() { + double dot_value = 0; + for (int i = 0; i < alignment_matrix[0].length; ++i) { + dot_value += alignment_matrix[1][i]; + } + + return dot_value; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/Parameter/Parameter.java b/src/main/java/proteomics/Parameter/Parameter.java new file mode 100644 index 0000000..5cf7bb6 --- /dev/null +++ b/src/main/java/proteomics/Parameter/Parameter.java @@ -0,0 +1,86 @@ +package proteomics.Parameter; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +public class Parameter { + private String parameter_file = "./src/main/resources/parameter.def"; + private Map parameter_map = new HashMap<>(); + + public Parameter() throws Exception { + Pattern comment_line_pattern = Pattern.compile("^#.*"); + Pattern comment_pattern = Pattern.compile("([^#]+)#?.*"); + + BufferedReader parameter_reader = new BufferedReader(new FileReader(parameter_file)); + try { + String line = ""; + while ((line = parameter_reader.readLine()) != null) { + line = line.trim(); + Matcher comment_line_matcher = comment_line_pattern.matcher(line); + if (!comment_line_matcher.matches()) { + // This is not a comment line + Matcher line_matcher = comment_pattern.matcher(line); + String parameter_string; + if (line_matcher.matches()) { + parameter_string = line_matcher.group(1).trim(); + String[] parameter_array = parameter_string.split("="); + String parameter_name = parameter_array[0].trim(); + String parameter_value = parameter_array[1].trim(); + parameter_map.put(parameter_name, parameter_value); + } + } + } + } catch (IOException ex) { + System.err.println("IOException: " + ex.getMessage()); + System.exit(1); + } catch (IllegalStateException ex) { + System.err.println("IllegalStateException: " + ex.getMessage()); + System.exit(1); + } finally { + parameter_reader.close(); + } + } + + + public Parameter(String parameter_file) throws Exception { + this.parameter_file = parameter_file; + + Pattern comment_line_pattern = Pattern.compile("^#.*"); + Pattern comment_pattern = Pattern.compile("([^#]+)#?.*"); + + BufferedReader parameter_reader = new BufferedReader(new FileReader(parameter_file)); + try { + String line = ""; + while ((line = parameter_reader.readLine()) != null) { + line = line.trim(); + Matcher comment_line_matcher = comment_line_pattern.matcher(line); + if (!comment_line_matcher.matches()) { + // This is not a comment line + Matcher line_matcher = comment_pattern.matcher(line); + String parameter_string; + if (line_matcher.matches()) { + parameter_string = line_matcher.group(1).trim(); + String[] parameter_array = parameter_string.split("="); + String parameter_name = parameter_array[0].trim(); + String parameter_value = parameter_array[1].trim(); + parameter_map.put(parameter_name, parameter_value); + } + } + } + } catch (IOException ex) { + System.err.println("IOException: " + ex.getMessage()); + System.exit(1); + } catch (IllegalStateException ex) { + System.err.println("IllegalStateException: " + ex.getMessage()); + System.exit(1); + } finally { + parameter_reader.close(); + } + } + + + public Map returnParameterMap() { + return parameter_map; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/Search/FinalResultEntry.java b/src/main/java/proteomics/Search/FinalResultEntry.java new file mode 100644 index 0000000..a57f8bb --- /dev/null +++ b/src/main/java/proteomics/Search/FinalResultEntry.java @@ -0,0 +1,42 @@ +package proteomics.Search; + +public class FinalResultEntry { + + public int spectrum_id = 0; + public String cl_index = null; + public int rank = 0; + public int charge = 0; + public float spectrum_precursor_mz = 0; + public float abs_ppm = 0; + public double xcorr = 0; + public double delta_xcorr = 0; + public String sequence_1 = null; + public String mod_1 = null; + public String protein_id_1 = null; + public String sequence_2 = null; + public String mod_2 = null; + public String protein_id_2 = null; + public String cl_type = null; + public String type = null; + public double qvalue = 0; + + public FinalResultEntry(int spectrum_id, String cl_index, int rank, int charge, float spectrum_precursor_mz, float abs_ppm, double xcorr, double delta_xcorr, String sequence_1, String mod_1, String protein_id_1, String sequence_2, String mod_2, String protein_id_2, String cl_type, String type, double qvalue) { + this.spectrum_id = spectrum_id; + this.cl_index = cl_index; + this.rank = rank; + this.charge = charge; + this.spectrum_precursor_mz= spectrum_precursor_mz; + this.abs_ppm = abs_ppm; + this.xcorr = xcorr; + this.delta_xcorr = delta_xcorr; + this.sequence_1 = sequence_1; + this.mod_1 = mod_1; + this.protein_id_1 = protein_id_1; + this.sequence_2 = sequence_2; + this.mod_2 = mod_2; + this.protein_id_2 = protein_id_2; + this.cl_type = cl_type; + this.type = type; + this.qvalue = qvalue; + } +} diff --git a/src/main/java/proteomics/Search/PrepareSearch.java b/src/main/java/proteomics/Search/PrepareSearch.java new file mode 100644 index 0000000..8717d07 --- /dev/null +++ b/src/main/java/proteomics/Search/PrepareSearch.java @@ -0,0 +1,73 @@ +package proteomics.Search; + +import proteomics.Index.*; +import java.util.*; +import theoSeq.*; + +public class PrepareSearch { + + private MassTool mass_tool_obj = null; + private Map parameter_map = null; + private Map index_chain_map = null; + private Map chain_entry_map = null; + private Map fix_mod_map = new HashMap<>(); + + /////////////////////////////public methods/////////////////////////////////////////////////////////////////// + public PrepareSearch(Map parameter_map) throws Exception { + this.parameter_map = parameter_map; + + // Read fix modification + fix_mod_map.put("G", Double.valueOf(parameter_map.get("G"))); + fix_mod_map.put("A", Double.valueOf(parameter_map.get("A"))); + fix_mod_map.put("S", Double.valueOf(parameter_map.get("S"))); + fix_mod_map.put("P", Double.valueOf(parameter_map.get("P"))); + fix_mod_map.put("V", Double.valueOf(parameter_map.get("V"))); + fix_mod_map.put("T", Double.valueOf(parameter_map.get("T"))); + fix_mod_map.put("C", Double.valueOf(parameter_map.get("C"))); + fix_mod_map.put("I", Double.valueOf(parameter_map.get("I"))); + fix_mod_map.put("L", Double.valueOf(parameter_map.get("L"))); + fix_mod_map.put("N", Double.valueOf(parameter_map.get("N"))); + fix_mod_map.put("D", Double.valueOf(parameter_map.get("D"))); + fix_mod_map.put("Q", Double.valueOf(parameter_map.get("Q"))); + fix_mod_map.put("K", Double.valueOf(parameter_map.get("K"))); + fix_mod_map.put("E", Double.valueOf(parameter_map.get("E"))); + fix_mod_map.put("M", Double.valueOf(parameter_map.get("M"))); + fix_mod_map.put("H", Double.valueOf(parameter_map.get("H"))); + fix_mod_map.put("F", Double.valueOf(parameter_map.get("F"))); + fix_mod_map.put("R", Double.valueOf(parameter_map.get("R"))); + fix_mod_map.put("Y", Double.valueOf(parameter_map.get("Y"))); + fix_mod_map.put("W", Double.valueOf(parameter_map.get("W"))); + fix_mod_map.put("Z", Double.valueOf(parameter_map.get("Z"))); + + BuildIndex build_index_obj = new BuildIndex(parameter_map, fix_mod_map); + mass_tool_obj = build_index_obj.returnMassToolObj(); + + // Checking whether building a SQL database. + System.out.println("Building database index..."); + build_index_obj.buildCLDb(parameter_map); + + BuildMap build_map_obj = new BuildMap(parameter_map, fix_mod_map); + index_chain_map = build_map_obj.returnIndexChainMap(); + chain_entry_map = build_map_obj.returnChainEntryMap(); + } + + public MassTool returnMassTool() { + return mass_tool_obj; + } + + public Map returnParameterMap() { + return parameter_map; + } + + public Map returnIndexChainMap() { + return index_chain_map; + } + + public Map returnChainEntryMap() { + return chain_entry_map; + } + + public Map returnFixModMap() { + return fix_mod_map; + } +} diff --git a/src/main/java/proteomics/Search/ResultEntry.java b/src/main/java/proteomics/Search/ResultEntry.java new file mode 100644 index 0000000..123e1bb --- /dev/null +++ b/src/main/java/proteomics/Search/ResultEntry.java @@ -0,0 +1,16 @@ +package proteomics.Search; + +public class ResultEntry { + + public float abs_ppm = 0; + public double xcorr = 0; + public String cl_index = null; + public double scond_xcorr = 0; + + public ResultEntry(float abs_ppm, double xcorr, double second_xcorr, String cl_index) { + this.abs_ppm = abs_ppm; + this.xcorr = xcorr; + this.cl_index = cl_index; + this.scond_xcorr = second_xcorr; + } +} diff --git a/src/main/java/proteomics/Search/Search.java b/src/main/java/proteomics/Search/Search.java new file mode 100644 index 0000000..1a38f6a --- /dev/null +++ b/src/main/java/proteomics/Search/Search.java @@ -0,0 +1,438 @@ +package proteomics.Search; + +import org.systemsbiology.jrap.stax.MSXMLParser; +import org.systemsbiology.jrap.stax.Scan; +import org.systemsbiology.jrap.stax.ScanHeader; +import proteomics.Index.ChainEntry; +import proteomics.LogEntry; +import proteomics.Math.CalScore; +import proteomics.Spectrum.PreSpectrum; +import theoSeq.MassTool; + +import java.io.File; +import java.io.FileNotFoundException; +import java.util.*; + +public class Search { + + private static final double PROTON_MASS = 1.00727646688; + + private LogEntry log_entry = null; + private int[] ms1_charge = null; + private int[] common_ion_charge = null; + private int[] xlink_ion_charge = null; + private float ms1_tolerance = 0; + private int ms1_tolerance_unit = 0; + private double ms2_tolerance = 0; + private double linker_mass = 0; + private float min_precursor_mass = 0; + private float max_precursor_mass = 0; + private int min_peak_num = 0; + private Map index_chain_map = null; + private Map chain_entry_map = null; + private Map fix_mod_map = null; + private MassTool mass_tool = null; + private String output_str = null; + private Map num_spectrum_map = new HashMap<>(); + private float single_dot_product_t = 1e-6f; + private TreeMap> mass1000_spectrum_map = new TreeMap<>(); + private TreeMap> mass1000_chain_map = new TreeMap<>(); + private int linker_mass1000 = 0; + + /////////////////////////////////////////public methods//////////////////////////////////////////////////////////// + public Search(PrepareSearch ps, LogEntry log_entry) throws Exception { + this.log_entry = log_entry; + + Map parameter_map = ps.returnParameterMap(); + index_chain_map = ps.returnIndexChainMap(); + chain_entry_map = ps.returnChainEntryMap(); + fix_mod_map = ps.returnFixModMap(); + mass_tool = ps.returnMassTool(); + ms1_charge = string2Array(parameter_map.get("ms1_charge")); + common_ion_charge = string2Array(parameter_map.get("common_ion_charge")); + xlink_ion_charge = string2Array(parameter_map.get("xlink_ion_charge")); + ms1_tolerance_unit = Integer.valueOf(parameter_map.get("ms1_tolerance_unit")); + ms1_tolerance = Float.valueOf(parameter_map.get("ms1_tolerance")); + ms2_tolerance = Double.valueOf(parameter_map.get("ms2_tolerance")); + String cl_aa = parameter_map.get("cl_aa"); + linker_mass = Double.valueOf(parameter_map.get("cl_mass")) - 2 * Double.valueOf(parameter_map.get(cl_aa)); // TODO: different link sites have different mod. The compensation should be different. + linker_mass1000 = round1000((float) linker_mass); + min_precursor_mass = Float.valueOf(parameter_map.get("min_precursor_mass")); + max_precursor_mass = Float.valueOf(parameter_map.get("max_precursor_mass")); + min_peak_num = Integer.valueOf(parameter_map.get("min_peak_num")); + } + + public List doSearch(String msxml_path) throws Exception { + // Read mzxml + MSXMLParser msxml_parser = null; + try { + File f = new File(msxml_path); + if ((!f.exists() || (f.isDirectory()))) { + FileNotFoundException file_error = new FileNotFoundException("mzXML file not found."); + throw file_error; + } + msxml_parser = new MSXMLParser(msxml_path); + } catch (FileNotFoundException ex) { + System.err.println(ex.getMessage()); + System.exit(1); + } + + // Get current time + double time_spectra_start = System.nanoTime(); + System.out.println("Reading and processing spectra..."); + for (int j = 1; j <= msxml_parser.getMaxScanNumber(); ++j) { + Scan scan = msxml_parser.rap(j); + ScanHeader header = scan.getHeader(); + + if (header.getMsLevel() == 1) { + continue; + } + + if (header.getPrecursorCharge() == -1) { + continue; + } + + int precursor_charge = header.getPrecursorCharge(); + if ((precursor_charge < ms1_charge[0]) || (precursor_charge > ms1_charge[ms1_charge.length - 1])) { + continue; + } + + float precursor_mz = header.getPrecursorMz(); + float precursor_mass = precursor_mz * precursor_charge - precursor_charge * (float) PROTON_MASS; + if ((precursor_mass > max_precursor_mass + 1) || (precursor_mass < min_precursor_mass - 1)) { + continue; + } + + double[][] raw_mz_intensity_array = scan.getMassIntensityList(); + if (raw_mz_intensity_array[0].length < min_peak_num) { + continue; + } + + double[][] mz_intensity_array = PreSpectrum.preProcessSpec(raw_mz_intensity_array, precursor_mass, precursor_charge, ms2_tolerance, precursor_mass); + if (mz_intensity_array[0].length <= min_peak_num) { + continue; + } + + SpectrumEntry spectrum_entry = new SpectrumEntry(j, header.getPrecursorIntensity(), precursor_mz, precursor_mass, precursor_charge, mz_intensity_array); + + if (mass1000_spectrum_map.containsKey(round1000(precursor_mass))) { + List spectrum_list = mass1000_spectrum_map.get(round1000(precursor_mass)); + spectrum_list.add(spectrum_entry); + mass1000_spectrum_map.put(round1000(precursor_mass), spectrum_list); + } else { + List spectrum_list = new LinkedList<>(); + spectrum_list.add(spectrum_entry); + mass1000_spectrum_map.put(round1000(precursor_mass), spectrum_list); + } + + num_spectrum_map.put(j, spectrum_entry); + } + + double time_spectra_end = System.nanoTime(); + double spectra_duration = (time_spectra_end - time_spectra_start) * 1e-9; + System.out.println("Duration: " + (int) spectra_duration + " seconds"); + output_str += "Duration: " + (int) spectra_duration + " seconds" + "\r\n"; + + // Generate a mass chain map. + Set chain_set_1 = chain_entry_map.keySet(); + for (String chain : chain_set_1) { + ChainEntry chain_entry = chain_entry_map.get(chain); + int mass1000 = round1000(chain_entry.chain_mass); + if (mass1000_chain_map.containsKey(mass1000)) { + Set temp = mass1000_chain_map.get(mass1000); + temp.add(chain); + mass1000_chain_map.put(mass1000, temp); + } else { + Set temp = new HashSet<>(); + temp.add(chain); + mass1000_chain_map.put(mass1000, temp); + } + } + + // Get current time + Map result_map = new HashMap<>(); + double time_search_start = System.nanoTime(); + System.out.println("Searching cross-linked peptides..."); + output_str += "Searching cross-linked peptides..." + "\r\n"; + + Integer[] mass1000_array = mass1000_chain_map.keySet().toArray(new Integer[mass1000_chain_map.size()]); + int max_spectrum_mass1000 = mass1000_spectrum_map.lastKey(); + int mass1000_1_max = (max_spectrum_mass1000 - linker_mass1000) / 2; + int mass1000_1_max_idx = 0; + for (int i = 0; i < mass1000_array.length; ++i) { + if (mass1000_array[i] >= mass1000_1_max) { + mass1000_1_max_idx = i; + break; + } + } + int last_progress = 0; + float step = 0; + float total = (float) mass1000_1_max_idx; + for (int i = 0; i <= mass1000_1_max_idx; ++i) { + ++step; + int progress = (int) Math.floor(step * 10 / total); + if (progress != last_progress) { + System.out.print(progress + "0% "); + last_progress = progress; + } + + int mass1000_1 = mass1000_array[i]; + + int start_mass = 2 * mass1000_1 + linker_mass1000; + if (ms1_tolerance_unit == 0) { + start_mass -= round1000(ms1_tolerance); + } else if (ms1_tolerance_unit == 1) { + start_mass -= (int) (start_mass * ms1_tolerance / 1e6f); + } + if (start_mass >= max_spectrum_mass1000) { + break; + } + + NavigableMap> sub_spectra_map = mass1000_spectrum_map.subMap(start_mass, true, max_spectrum_mass1000, true); + if (sub_spectra_map.size() == 0) { + continue; + } + + TreeMap> exp_mass1000_2_semiPSM_map = new TreeMap<>(); + chain_set_1 = mass1000_chain_map.get(mass1000_1); + for (String chain_1 : chain_set_1) { + ChainEntry chain_entry_1 = this.chain_entry_map.get(chain_1); + List link_site_list_1 = chain_entry_1.linksite_list; + double[][] seq_ion_1 = chain_entry_1.chain_ion_array; + linearSearch(sub_spectra_map, mass1000_1, chain_entry_1.chain_index, seq_ion_1, link_site_list_1, exp_mass1000_2_semiPSM_map); + } + + if (exp_mass1000_2_semiPSM_map.size() == 0) { + continue; + } + + Set exp_mass1000_2_set = exp_mass1000_2_semiPSM_map.keySet(); + for (int exp_mass1000_2 : exp_mass1000_2_set) { + int exp_mass1000 = mass1000_1 + exp_mass1000_2 + linker_mass1000; + int mass1000_2_left = 0; + int mass1000_2_right = 0; + if (ms1_tolerance_unit == 0) { + mass1000_2_left = exp_mass1000_2 - round1000(ms1_tolerance); + mass1000_2_right = exp_mass1000_2 + round1000(ms1_tolerance); + } else if (ms1_tolerance_unit == 1) { + mass1000_2_left = (int) (exp_mass1000 / (1 + ms1_tolerance / 1e6)) - mass1000_1 - linker_mass1000; + mass1000_2_right = (int) (exp_mass1000 / (1 - ms1_tolerance / 1e6)) - mass1000_1 - linker_mass1000; + } + NavigableMap> mass1000_2_map = mass1000_chain_map.subMap(mass1000_2_left, true, mass1000_2_right, true); + List semi_psm_list = exp_mass1000_2_semiPSM_map.get(exp_mass1000_2); + Set mass1000_2_set = mass1000_2_map.keySet(); + for (int mass1000_2 : mass1000_2_set) { + Set chain_set_2 = mass1000_chain_map.get(mass1000_2); + for (String chain_2 : chain_set_2) { + ChainEntry chain_entry_2 = chain_entry_map.get(chain_2); + double[][] seq_ion_2 = chain_entry_2.chain_ion_array; + List link_site_list_2 = chain_entry_2.linksite_list; + for (int link_site_2 : link_site_list_2) { + double[][] pseudo_ion_array = mass_tool.buildPseudoCLIonArray(seq_ion_2, link_site_2, common_ion_charge, xlink_ion_charge, back1000(mass1000_1) + (float) linker_mass); + Map charge_theo_mz_map = new HashMap<>(); + double[] theo_mz_2; + for (SemiPSM semi_psm : semi_psm_list) { + int scan_num = semi_psm.scan_num; + SpectrumEntry spectrum_entry = num_spectrum_map.get(scan_num); + int precursor_charge = spectrum_entry.precursor_charge; + if (charge_theo_mz_map.containsKey(precursor_charge)) { + theo_mz_2 = charge_theo_mz_map.get(precursor_charge); + } else { + theo_mz_2 = mass_tool.buildVector(pseudo_ion_array, precursor_charge, common_ion_charge[0]); + charge_theo_mz_map.put(precursor_charge, theo_mz_2); + } + CalScore cal_score_2 = new CalScore(spectrum_entry.mz_intensity_array, theo_mz_2, ms2_tolerance); + double dot_product_2 = cal_score_2.cal_dot_product(); + if (dot_product_2 <= single_dot_product_t) { + continue; + } + + // Calculate final score + double xcorr = (semi_psm.dot_product + dot_product_2) / Math.sqrt(semi_psm.ion_num + theo_mz_2.length); + + // record result + if (result_map.containsKey(scan_num)) { + ResultEntry last_result = result_map.get(scan_num); + if (xcorr > last_result.xcorr) { + float total_mass = back1000(mass1000_1 + mass1000_2) + (float) linker_mass; + float abs_ppm = (float) (Math.abs(spectrum_entry.precursor_mass - total_mass) * 1e6 / total_mass); + String cl_index = semi_psm.chain_index + "-" + chain_entry_2.chain_index + "-" + semi_psm.link_site + "-" + link_site_2; + ResultEntry result_entry = new ResultEntry(abs_ppm, xcorr, last_result.xcorr, cl_index); + result_map.put(scan_num, result_entry); + } else if (xcorr > last_result.scond_xcorr) { + ResultEntry result_entry = new ResultEntry(last_result.abs_ppm, last_result.xcorr, xcorr, last_result.cl_index); + result_map.put(scan_num, result_entry); + } + } else { + float total_mass = back1000(mass1000_1 + mass1000_2) + (float) linker_mass; + float abs_ppm = (float) (Math.abs(spectrum_entry.precursor_mass - total_mass) * 1e6 / total_mass); + String cl_index = semi_psm.chain_index + "-" + chain_entry_2.chain_index + "-" + semi_psm.link_site + "-" + link_site_2; + ResultEntry result_entry = new ResultEntry(abs_ppm, xcorr, 0, cl_index); + result_map.put(scan_num, result_entry); + } + } + } + } + } + } + } + + // Build a fix modification only map for further use. + Map fix_mod_only = new HashMap<>(); + Set aa_set = fix_mod_map.keySet(); + for (String aa : aa_set) { + double mod_mass = fix_mod_map.get(aa); + if (mod_mass != 0.0) { + fix_mod_only.put(aa, mod_mass); + } + } + + List search_result = new LinkedList<>(); + Set spectrum_num_set = result_map.keySet(); + for (int spectrum_num : spectrum_num_set) { + int rank = 1; + ResultEntry result_entry = result_map.get(spectrum_num); + String cl_index = result_entry.cl_index; + String[] temp = cl_index.split("-"); + String chain_index_1 = temp[0] + "-" + temp[1]; + String var_mod_1 = temp[1]; + String chain_index_2 = temp[2] + "-" + temp[3]; + String var_mod_2 = temp[3]; + String alpha_chain = index_chain_map.get(chain_index_1); + String beta_chain = index_chain_map.get(chain_index_2); + ChainEntry chain_entry_1 = chain_entry_map.get(alpha_chain); + ChainEntry chain_entry_2 = chain_entry_map.get(beta_chain); + String type = chain_entry_1.chain_type + chain_entry_2.chain_type; // 11 means target, 00, 10, 01 are decoy + + // Add fix modification annotation string. + Set fix_mod_aa = fix_mod_only.keySet(); + String fix_mod_1 = ""; + String fix_mod_2 = ""; + for (String aa : fix_mod_aa) { + if (alpha_chain.contains(aa)) { + fix_mod_1 += fix_mod_only.get(aa) + "@" + aa + ";"; + } + if (beta_chain.contains(aa)) { + fix_mod_2 += fix_mod_only.get(aa) + "@" + aa + ";"; + } + } + if (fix_mod_aa.contains("Z")) { + fix_mod_1 += fix_mod_only.get("Z") + "@Z;"; + fix_mod_2 += fix_mod_only.get("Z") + "@Z;"; + } + + String mod_1 = fix_mod_1 + var_mod_1; + String mod_2 = fix_mod_2 + var_mod_2; + + ScanHeader scan_header = msxml_parser.rap(spectrum_num).getHeader(); + + int precursor_charge = scan_header.getPrecursorCharge(); + + String pro_1 = chain_entry_1.pro_id; + if (chain_entry_1.chain_type.contentEquals("0")) { + pro_1 = chain_entry_1.pro_id.substring(6); + } + + String pro_2 = chain_entry_2.pro_id; + if (chain_entry_2.chain_type.contentEquals("0")) { + pro_2 = chain_entry_2.pro_id.substring(6); + } + + String cl_type = "intra_protein"; + if (!pro_1.contentEquals(pro_2)) { + cl_type = "inter_protein"; + } + + double delta_xcorr = result_entry.scond_xcorr / result_entry.xcorr; + + FinalResultEntry re = new FinalResultEntry(spectrum_num, cl_index, rank, precursor_charge, scan_header.getPrecursorMz(), result_entry.abs_ppm, result_entry.xcorr, delta_xcorr, alpha_chain, mod_1, chain_entry_1.pro_id, beta_chain, mod_2, chain_entry_2.pro_id, cl_type, type, -1); + search_result.add(re); + } + + Calendar cal_search_end = Calendar.getInstance(); + double time_search_end = System.nanoTime(); + spectra_duration = (time_search_end - time_search_start) * 1e-9; + + System.out.println(); + System.out.println("Duration: " + (int) spectra_duration + " seconds"); + output_str += "Duration: " + (int) spectra_duration + " seconds" + "\r\n"; + + log_entry.output_str = output_str; + + return search_result; + } + + ///////////////////////////////////////private methods////////////////////////////////////////////////////////////// + private void linearSearch(NavigableMap> sub_spectra_map, int mass1000_1, String chain_index, double[][] seq_ion, List link_site_list_1, TreeMap> exp_mass1000_2_semiPSM_map) { + Set exp_mass1000_set = sub_spectra_map.keySet(); + for (int exp_mass1000 : exp_mass1000_set) { + // Get the MS1 mass range. + int mass1000_2_left = 0; + int mass1000_2_right = 0; + if (ms1_tolerance_unit == 0) { + mass1000_2_left = exp_mass1000 - round1000(ms1_tolerance) - mass1000_1 - linker_mass1000; + mass1000_2_right = exp_mass1000 + round1000(ms1_tolerance) - mass1000_1 - linker_mass1000; + } else if (ms1_tolerance_unit == 1) { + mass1000_2_left = (int) (exp_mass1000 / (1 + ms1_tolerance / 1e6f)) - mass1000_1 - linker_mass1000; + mass1000_2_right = (int) (exp_mass1000 / (1 - ms1_tolerance / 1e6f)) - mass1000_1 - linker_mass1000; + } + + NavigableMap> sub_map = mass1000_chain_map.subMap(mass1000_2_left, true, mass1000_2_right, true); + if (sub_map.size() == 0) { + continue; + } + + for (int link_site_1 : link_site_list_1) { + double[][] pseudo_ion_array = mass_tool.buildPseudoCLIonArray(seq_ion, link_site_1, common_ion_charge, xlink_ion_charge, back1000(exp_mass1000 - mass1000_1)); + List spectrum_list = sub_spectra_map.get(exp_mass1000); + Map charge_mz_map = new HashMap<>(); + for (SpectrumEntry spectrum_entry : spectrum_list) { + int precursor_charge = spectrum_entry.precursor_charge; + double[] theo_mz; + if (charge_mz_map.containsKey(precursor_charge)) { + theo_mz = charge_mz_map.get(precursor_charge); + } else { + theo_mz = mass_tool.buildVector(pseudo_ion_array, precursor_charge, common_ion_charge[0]); + charge_mz_map.put(precursor_charge, theo_mz); + } + + // Calculate dot produce + CalScore cal_score = new CalScore(spectrum_entry.mz_intensity_array, theo_mz, ms2_tolerance); + double dot_product = cal_score.cal_dot_product(); + + // Record result + if (dot_product > single_dot_product_t) { + int exp_mass1000_2 = exp_mass1000 - mass1000_1 - linker_mass1000; + if (exp_mass1000_2_semiPSM_map.containsKey(exp_mass1000_2)) { + List temp = exp_mass1000_2_semiPSM_map.get(exp_mass1000_2); + temp.add(new SemiPSM(spectrum_entry.scan_num, dot_product, theo_mz.length, link_site_1, chain_index)); + exp_mass1000_2_semiPSM_map.put(exp_mass1000_2, temp); + } else { + List temp = new LinkedList<>(); + temp.add(new SemiPSM(spectrum_entry.scan_num, dot_product, theo_mz.length, link_site_1, chain_index)); + exp_mass1000_2_semiPSM_map.put(exp_mass1000_2, temp); + } + } + } + } + } + } + + private int[] string2Array(String str) { + String[] temp = str.split(","); + int[] output = new int[temp.length]; + for (int i = 0; i < output.length; ++i) { + output[i] = Integer.valueOf(temp[i]); + } + + return output; + } + + private int round1000(float a) { + return (int) (a * 1000); + } + + private float back1000(int a) { + return (float) a / (float) 1000; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/Search/SemiPSM.java b/src/main/java/proteomics/Search/SemiPSM.java new file mode 100644 index 0000000..414396b --- /dev/null +++ b/src/main/java/proteomics/Search/SemiPSM.java @@ -0,0 +1,19 @@ +package proteomics.Search; + + +public class SemiPSM { + public int scan_num = 0; + public double dot_product = 0; + public int ion_num = 0; + public int link_site = 0; + public String chain_index = ""; + + public SemiPSM(int scan_num, double dot_product, int ion_num, int link_site, String chain_index) { + this.scan_num = scan_num; + this.dot_product = dot_product; + this.ion_num = ion_num; + this.link_site = link_site; + this.chain_index = chain_index; + } +} + diff --git a/src/main/java/proteomics/Search/SpectrumEntry.java b/src/main/java/proteomics/Search/SpectrumEntry.java new file mode 100644 index 0000000..4774939 --- /dev/null +++ b/src/main/java/proteomics/Search/SpectrumEntry.java @@ -0,0 +1,19 @@ +package proteomics.Search; + +public class SpectrumEntry { + public int scan_num = 0; + public float precursor_intensity = 0; + public float precursor_mz = 0; + public float precursor_mass = 0; + public int precursor_charge = 0; + public double[][] mz_intensity_array = null; + + public SpectrumEntry(int scan_num, float precursor_intensity, float precursor_mz, float precursor_mass, int precursor_charge, double[][] mz_intensity_array) { + this.scan_num = scan_num; + this.precursor_intensity = precursor_intensity; + this.precursor_mz = precursor_mz; + this.precursor_mass = precursor_mass; + this.precursor_charge = precursor_charge; + this.mz_intensity_array = mz_intensity_array; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/SearchMain.java b/src/main/java/proteomics/SearchMain.java new file mode 100644 index 0000000..2c3909a --- /dev/null +++ b/src/main/java/proteomics/SearchMain.java @@ -0,0 +1,143 @@ +package proteomics; + +import proteomics.Parameter.Parameter; +import proteomics.Search.FinalResultEntry; +import proteomics.Search.PrepareSearch; +import proteomics.Search.Search; + +import java.io.BufferedWriter; +import java.io.FileWriter; +import java.io.IOException; +import java.text.SimpleDateFormat; +import java.util.*; + +public class SearchMain { + + public static void main(String[] args) throws Exception { + // Get current time + SimpleDateFormat sdf = new SimpleDateFormat("yyyy.MM.dd 'at' HH:mm:ss Z"); + Calendar cal_start = Calendar.getInstance(); + Date date_start = cal_start.getTime(); + double time_start = System.nanoTime(); + + // Process inputs + if (args.length != 2) { + help(); + } + + // Set parameters + String parameter_path = args[0].trim(); + String msxml_path = args[1].trim(); + + // Get the parameter map + Parameter parameter = new Parameter(parameter_path); + Map parameter_map = parameter.returnParameterMap(); + + // Prepare search + PrepareSearch ps = new PrepareSearch(parameter_map); + + // Searching... + LogEntry log_entry = new LogEntry(""); + Search search = new Search(ps, log_entry); + List search_results = search.doSearch(msxml_path); + + if (search_results.isEmpty()) { + System.out.println("There is no PSM."); + } else { + // save result + List> picked_result = pickResult(search_results); + System.out.println("Saving results..."); + saveResult2CSV(picked_result.get(0), picked_result.get(1), msxml_path); + } + + // Get end time + Calendar cal_end = Calendar.getInstance(); + Date date_end = cal_end.getTime(); + double time_end = System.nanoTime(); + + double duration = (time_end - time_start) * 1e-9; + + try (BufferedWriter target_writer = new BufferedWriter(new FileWriter(msxml_path + ".log.txt"))) { + target_writer.write(msxml_path + " finished.\r\n" + + "Started on: " + sdf.format(date_start) + "\r\n" + + "Ended on: " + sdf.format(date_end) + "\r\n" + + "Duration: " + (int) duration + " second" + "\r\n" + + "\r\n" + + "Log: \r\n" + + log_entry.output_str + "\r\n"); + } catch (IOException ex) { + System.err.println(ex.getMessage()); + } + + System.out.println("Done."); + } + + //////////////////////////////////////////private methods/////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + private static void saveResult2CSV(List search_results_intra, List search_results_inter, String id_file_name) throws Exception { + try (BufferedWriter intra_writer = new BufferedWriter(new FileWriter(id_file_name + ".intra.xls"))) { + intra_writer.write("id\tlabel\tscannr\txcorr\tdeltaXcorr\tabsppm\tpeptide\tproteinId1\n"); + for (FinalResultEntry re : search_results_intra) { + String[] temp = re.cl_index.split("-"); + int link_site_1 = Integer.valueOf(temp[4]) + 1; + int link_site_2 = Integer.valueOf(temp[5]) + 1; + if (re.type.contentEquals("11")) { + intra_writer.write(re.spectrum_id + "." + re.charge + "\t" + "1" + "\t" + re.spectrum_id + "\t" + re.xcorr + "\t" + re.delta_xcorr + "\t" + re.abs_ppm + "\t" + "-." + re.sequence_1 + "-" + link_site_1 + "-" + re.sequence_2 + "-" + link_site_2 + ".-" + "\t" + re.protein_id_1 + "-" + re.protein_id_2 + "\n"); + } else { + intra_writer.write(re.spectrum_id + "." + re.charge + "\t" + "-1" + "\t" + re.spectrum_id + "\t" + re.xcorr + "\t" + re.delta_xcorr + "\t" + re.abs_ppm + "\t" + "-." + re.sequence_1 + "-" + link_site_1 + "-" + re.sequence_2 + "-" + link_site_2 + ".-" + "\t" + re.protein_id_1 + "-" + re.protein_id_2 + "\n"); + } + } + } catch (IOException ex) { + System.err.println("IOException: " + ex.getMessage()); + System.exit(1); + } + + try (BufferedWriter inter_writer = new BufferedWriter(new FileWriter(id_file_name + ".inter.xls"))) { + inter_writer.write("id\tlabel\tscannr\txcorr\tdeltaXcorr\tabsppm\tpeptide\tproteinId1\n"); + for (FinalResultEntry re : search_results_inter) { + String[] temp = re.cl_index.split("-"); + int link_site_1 = Integer.valueOf(temp[4]) + 1; + int link_site_2 = Integer.valueOf(temp[5]) + 1; + if (re.type.contentEquals("11")) { + inter_writer.write(re.spectrum_id + "." + re.charge + "\t" + "1" + "\t" + re.spectrum_id + "\t" + re.xcorr + "\t" + re.delta_xcorr + "\t" + re.abs_ppm + "\t" + "-." + re.sequence_1 + "-" + link_site_1 + "-" + re.sequence_2 + "-" + link_site_2 + ".-" + "\t" + re.protein_id_1 + "-" + re.protein_id_2 + "\n"); + } else { + inter_writer.write(re.spectrum_id + "." + re.charge + "\t" + "-1" + "\t" + re.spectrum_id + "\t" + re.xcorr + "\t" + re.delta_xcorr + "\t" + re.abs_ppm + "\t" + "-." + re.sequence_1 + "-" + link_site_1 + "-" + re.sequence_2 + "-" + link_site_2 + ".-" + "\t" + re.protein_id_1 + "-" + re.protein_id_2 + "\n"); + } + } + } catch (IOException ex) { + System.err.println("IOException: " + ex.getMessage()); + System.exit(1); + } + } + + private static List> pickResult(List search_result) { + List> picked_result = new LinkedList<>(); + List inter_protein_result = new LinkedList<>(); + List intra_protein_result = new LinkedList<>(); + + for (FinalResultEntry result_entry : search_result) { + if (result_entry.cl_type.contentEquals("intra_protein")) { + intra_protein_result.add(result_entry); + } else { + inter_protein_result.add(result_entry); + } + } + + picked_result.add(intra_protein_result); + picked_result.add(inter_protein_result); + + return picked_result; + } + + private static void help() { + String help_str = "ECL version 20151011\r\n" + + "A cross-linked peptides identification tool." + + "Author: Fengchao Yu\r\n" + + "Email: fyuab@connect.ust.hk\r\n" + + "ECL usage: java -Xmx32g -jar /path/to/ECL.jar \r\n" + + "\t: parameter file. Can be download along with ECL.\r\n" + + "\t: spectra data file (mzXML)\r\n" + + "\texample: java -Xmx32g -jar ECL.jar parameter.def data.mzxml"; + System.out.print(help_str); + System.exit(1); + } +} diff --git a/src/main/java/proteomics/Spectrum/PreSpectrum.java b/src/main/java/proteomics/Spectrum/PreSpectrum.java new file mode 100644 index 0000000..83b12cb --- /dev/null +++ b/src/main/java/proteomics/Spectrum/PreSpectrum.java @@ -0,0 +1,181 @@ +package proteomics.Spectrum; + +import java.util.*; + +public class PreSpectrum { + + private static final double PROTON_MASS = 1.00727646688; + private static final double DEFAULT_INTENSITY = 1; // DO NOT change. Otherwise, change CalScore accordingly. + private static final Double DOUBLE_ZERO = 1e-6; + + public static double[][] preProcessSpec(double[][] peaks_array, float precursor_mass, int precursor_charge, double ms2_tolerance, double max_mz) { + // remove precursor peak from spectrum + TreeMap temp = removePrecursorPeak(peaks_array, precursor_mass, precursor_charge, ms2_tolerance); + + // reduce noise + TreeMap denoised_pl_map = deNoise(new TreeMap<>(temp.subMap(0d, max_mz))); + + // normalize + TreeMap normzlized_pl_map = normalizeSpec(denoised_pl_map); + + return prepareXcorr(normzlized_pl_map); + } + + private static TreeMap removePrecursorPeak(double[][] peak_array, float precursor_mass, int precursor_charge, double ms2_tolerance) { + TreeMap mz_intensity_map = new TreeMap<>(); + + for (int i = 0; i < peak_array[0].length; ++i) { + for (int charge = precursor_charge; charge > 0; --charge) { + float mz = (float) (precursor_mass + charge * PROTON_MASS) / charge; + if ((peak_array[1][i] > DOUBLE_ZERO) && (Math.abs(peak_array[0][i] - mz) > ms2_tolerance)) { + mz_intensity_map.put(peak_array[0][i], peak_array[1][i]); + } + } + } + + return mz_intensity_map; + } + +// private static TreeMap deNoise(SortedMap pl_map) { +// TreeMap denoised_pl_map = new TreeMap<>(); +// for (double mz : pl_map.keySet()) { +// double left_mz = Math.max(pl_map.firstKey(), mz - 25); +// double right_mz = Math.min(pl_map.lastKey(), mz + 25); +// SortedMap sub_map = pl_map.subMap(left_mz, right_mz); +// Double[] intensity_array = sub_map.values().toArray(new Double[sub_map.size()]); +// if (intensity_array.length > 6) { +// Arrays.sort(intensity_array); +// double threshold = intensity_array[intensity_array.length - 6]; +// if (pl_map.get(mz) > threshold) { +// denoised_pl_map.put(mz, pl_map.get(mz)); +// } +// } else { +// denoised_pl_map.put(mz, pl_map.get(mz)); +// } +// } +// +// return denoised_pl_map; +// } + + private static TreeMap deNoise(TreeMap pl_map) { + TreeMap denoised_pl_map = new TreeMap<>(); + double window_size = (pl_map.lastKey() - pl_map.firstKey()) / 10 + 1; + for (int i = 0; i < 10; ++i) { + double left_mz = pl_map.firstKey() + i * window_size; + double right_mz = Math.min(left_mz + window_size, pl_map.lastKey()); + NavigableMap sub_pl_map = pl_map.subMap(left_mz, true, right_mz, true); + if (sub_pl_map.size() > 4) { + double noise_intensity = estimateNoiseIntensity(sub_pl_map); + for (double mz : sub_pl_map.keySet()) { + if (sub_pl_map.get(mz) > noise_intensity) { + denoised_pl_map.put(mz, sub_pl_map.get(mz)); + } + } + } else { + for (double mz : sub_pl_map.keySet()) { + denoised_pl_map.put(mz, sub_pl_map.get(mz)); + } + } + } + return denoised_pl_map; + } + + private static double estimateNoiseIntensity(Map pl) { + Set intensity_set = new HashSet<>(pl.values()); + Double[] unique_intensity_vector = intensity_set.toArray(new Double[intensity_set.size()]); + Arrays.sort(unique_intensity_vector); + double[] cum = new double[unique_intensity_vector.length]; + for (int i = 0; i < unique_intensity_vector.length; ++i) { + for (double intensity : pl.values()) { + if (intensity <= unique_intensity_vector[i]) { + ++cum[i]; + } + } + } + double[][] diff = new double[2][unique_intensity_vector.length - 1]; + for (int i = 0; i < unique_intensity_vector.length - 1; ++i) { + diff[0][i] = cum[i + 1] - cum[i]; + diff[1][i] = unique_intensity_vector[i + 1] - unique_intensity_vector[i]; + } + double[] diff_2 = new double[unique_intensity_vector.length - 1]; + for (int i = 0; i < unique_intensity_vector.length - 1; ++i) { + diff_2[i] = diff[0][i] / (diff[1][i] + DOUBLE_ZERO); + } + double max_value = 0; + int max_idx = 0; + for (int i = 0; i < unique_intensity_vector.length - 1; ++i) { + if (diff_2[i] > max_value) { + max_value = diff_2[i]; + max_idx = i; + } + } + + return unique_intensity_vector[max_idx]; // TODO: improve + } + + private static TreeMap normalizeSpec(TreeMap pl_map) { + // sqrt the intensity and find the highest intensity. + TreeMap sqrt_pl_map = new TreeMap<>(); + double highest_intensity = 0; + Set mz_set = pl_map.keySet(); + for (double mz : mz_set) { + if (pl_map.get(mz) > DOUBLE_ZERO) { + double sqrt_intensity = Math.sqrt(pl_map.get(mz)); + if (sqrt_intensity > highest_intensity) { + highest_intensity = sqrt_intensity; + } + sqrt_pl_map.put(mz, sqrt_intensity); + } + } + + // normalize the highest intensity to DEFAULT_INTENSITY + double factor = DEFAULT_INTENSITY / highest_intensity; + mz_set = sqrt_pl_map.keySet(); + for (double mz : mz_set) { + sqrt_pl_map.put(mz, sqrt_pl_map.get(mz) * factor); + } + + // divide the spectrum into 10 windows and normalize each windows to DEFAULT_INTENSITY + TreeMap windowed_pl_map = new TreeMap<>(); + double window_size = (sqrt_pl_map.lastKey() - sqrt_pl_map.firstKey()) / 10 + 1; + for (int i = 0; i < 10; ++i) { + // find the max intensity in each window + double left_mz = sqrt_pl_map.firstKey() + i * window_size; + double right_mz = Math.min(left_mz + window_size, sqrt_pl_map.lastKey()); + NavigableMap sub_map = sqrt_pl_map.subMap(left_mz, true, right_mz, true); + if (!sub_map.isEmpty()) { + Double[] intensity_array = sub_map.values().toArray(new Double[sub_map.size()]); + double temp_1 = DEFAULT_INTENSITY / intensity_array[intensity_array.length - 1]; + double temp_2 = 0.05 * intensity_array[intensity_array.length - 1]; + for (double mz : sub_map.keySet()) { + if (sub_map.get(mz) > temp_2) { + windowed_pl_map.put(mz, sub_map.get(mz) * temp_1); + } + } + } + } + + return windowed_pl_map; + } + + private static double[][] prepareXcorr(TreeMap mz_intensity_map) { + double[][] output = new double[2][mz_intensity_map.size()]; + + Collection intensity_list = mz_intensity_map.values(); + double temp = 0; + for (double intensity : intensity_list) { + temp += intensity * intensity; + } + temp = Math.sqrt(temp); + + Set mz_set = mz_intensity_map.keySet(); + int i = 0; + for (double mz : mz_set) { + output[0][i] = mz; + output[1][i] = mz_intensity_map.get(mz) / temp; + ++i; + } + + return output; + } +} diff --git a/src/main/java/proteomics/Validation/CalFDR.java b/src/main/java/proteomics/Validation/CalFDR.java new file mode 100644 index 0000000..65dc481 --- /dev/null +++ b/src/main/java/proteomics/Validation/CalFDR.java @@ -0,0 +1,84 @@ +package proteomics.Validation; + +import java.util.*; +import proteomics.Search.*; + +public class CalFDR { + + private static final float MAX_SCORE = 1; + private static final float PRECISION = 0.01f; + + private double[] qvalue_array = null; + private List results = null; + + public CalFDR(List results) { + this.results = results; + + final int array_length = 1 + (int) Math.ceil(MAX_SCORE / PRECISION); + double[] decoy_score_vector = new double[array_length]; + double[] target_score_vector = new double[array_length]; + double[] fuse_score_vector = new double[array_length]; + double[] fdr_array = new double[array_length]; + qvalue_array = new double[array_length]; + + for (FinalResultEntry re : results) { + if (re.type.contentEquals("00")) { + int idx = (int) Math.floor(re.xcorr / PRECISION); + ++decoy_score_vector[idx]; + } else if (re.type.contentEquals("11")) { + int idx = (int) Math.floor(re.xcorr / PRECISION); + ++target_score_vector[idx]; + } else { + int idx = (int) Math.floor(re.xcorr / PRECISION); + ++fuse_score_vector[idx]; + } + } + + // Calculate FDR + for (int idx_1 = 0; idx_1 < array_length; ++idx_1) { + int decoy_count = 0; + int fuse_count = 0; + int target_count = 0; + for (int idx_2 = idx_1; idx_2 < array_length; ++idx_2) { + decoy_count += decoy_score_vector[idx_2]; + target_count += target_score_vector[idx_2]; + fuse_count += fuse_score_vector[idx_2]; + } + + double fdr; + if (fuse_count < decoy_count) { + fdr = (double) decoy_count / (double) target_count; + } else { + fdr = (double) (fuse_count - decoy_count) / (double) target_count; + } + + fdr = Math.min(fdr, 1); // Adjust those fdrs that are larger than 1 + fdr_array[idx_1] = fdr; + } + + // Convert FDR to qvalue + double last_q_value = fdr_array[0]; + qvalue_array[0] = last_q_value; + + for (int idx_1 = 1; idx_1 < array_length; ++idx_1) { + double q_value = fdr_array[idx_1]; + if (q_value >= last_q_value) { + qvalue_array[idx_1] = last_q_value; + } else { + qvalue_array[idx_1] = q_value; + last_q_value = q_value; + } + } + } + + public List includeStats() { + for (FinalResultEntry re : results) { + if (re.type.contentEquals("11")) { + int idx = (int) Math.floor(re.xcorr / PRECISION); + re.qvalue = qvalue_array[idx]; + } + } + + return results; + } +} \ No newline at end of file diff --git a/src/main/java/proteomics/Validation/GenerateDecoySeq.java b/src/main/java/proteomics/Validation/GenerateDecoySeq.java new file mode 100644 index 0000000..7d34b49 --- /dev/null +++ b/src/main/java/proteomics/Validation/GenerateDecoySeq.java @@ -0,0 +1,97 @@ +package proteomics.Validation; + +import java.util.*; +import java.util.regex.*; + +public class GenerateDecoySeq { + + private String type = null; + private Pattern fix_pattern = null; + + public GenerateDecoySeq(String type) { + this.type = type; + + fix_pattern = Pattern.compile("[KR]"); + } + + public String generateSeq(String sequence) { + String decoy_seq = new String(); + if (type.contentEquals("shuffle")) { + decoy_seq = shuffleSeq(sequence); + } else if (type.contentEquals("reverse")) { + decoy_seq = reverseSeq(sequence); + } + + return decoy_seq; + } + + /** + * Shuffle a sequnce based on Fisher-Yates shuffle method + * + * @param sequence + * @return + */ + private String shuffleSeq(String sequence) { + String decoy_str = new String(); + Random rnd = new Random(); + Matcher fix_matcher = fix_pattern.matcher(sequence); + int sequence_length = sequence.length(); + int idx_1 = 0; + int idx_2 = 0; + while (idx_1 < sequence_length) { + String fix_aa = new String(); + if (fix_matcher.find()) { + idx_2 = fix_matcher.start(); + fix_aa = sequence.substring(idx_2, idx_2 + 1); + } else { + idx_2 = sequence_length; + fix_aa = ""; + } + String part = sequence.substring(idx_1, idx_2); + + // Fisher-Yates shuffle method + char[] part_array = part.toCharArray(); + int array_num = part_array.length; + for (int idx_3 = array_num - 1; idx_3 > 0; --idx_3) { + int idx_4 = rnd.nextInt(idx_3 + 1); + char temp = part_array[idx_4]; + part_array[idx_4] = part_array[idx_3]; + part_array[idx_3] = temp; + } + + // Put shuffled string to the result sequence. + decoy_str += new String(part_array) + fix_aa; + + idx_1 = idx_2 + 1; + } + + return decoy_str; + } + + private String reverseSeq(String sequence) { + String decoy_str = new String(); + Matcher fix_matcher = fix_pattern.matcher(sequence); + int sequence_length = sequence.length(); + int idx_1 = 0; + int idx_2 = 0; + while (idx_1 < sequence_length) { + String fix_aa = new String(); + if (fix_matcher.find()) { + idx_2 = fix_matcher.start(); + fix_aa = sequence.substring(idx_2, idx_2 + 1); + } else { + idx_2 = sequence_length; + fix_aa = ""; + } + String part = sequence.substring(idx_1, idx_2); + + // Reverse part sequence + StringBuilder temp = new StringBuilder(part); + StringBuilder reverse = temp.reverse(); + decoy_str += reverse.toString() + fix_aa; + idx_1 = idx_2 + 1; + } + + return decoy_str; + } +} diff --git a/src/main/java/theoSeq/DbTool.java b/src/main/java/theoSeq/DbTool.java new file mode 100644 index 0000000..ed79a63 --- /dev/null +++ b/src/main/java/theoSeq/DbTool.java @@ -0,0 +1,61 @@ +package theoSeq; + +import java.io.*; +import java.util.*; +import java.util.regex.*; + +public class DbTool { + + private Map pro_seq_map = new HashMap<>(); + private Map pro_annotate_map = new HashMap<>(); + + public DbTool(String db_name) throws Exception { + String id = new String(); + String annotate = new String(); + String seq = new String(); + + boolean new_pro = true; + + Pattern header_pattern = Pattern.compile(">([^\\s]*)(.*)"); + + try (BufferedReader db_reader = new BufferedReader(new FileReader(db_name))) { + String line = new String(); + while ((line = db_reader.readLine()) != null) { + line = line.trim(); + Matcher head_matcher = header_pattern.matcher(line); + if (head_matcher.matches()) { + // This line is a header + if (new_pro == false) { + // This isn't the first protein + pro_seq_map.put(id, seq); + } + id = head_matcher.group(1); + annotate = head_matcher.group(2); + pro_annotate_map.put(id, annotate); + new_pro = true; + } else if (line.isEmpty() == false) { + // This line is a body + if (new_pro == true) { + seq = line; + new_pro = false; + } else { + seq += line; + } + } + } + // Last protein + pro_seq_map.put(id, seq); + } catch (IOException | PatternSyntaxException ex) { + System.err.println("IOException: " + ex.getMessage()); + System.exit(1); + } + } + + public Map returnSeqMap() { + return pro_seq_map; + } + + public Map returnAnnotateMap() { + return pro_annotate_map; + } +} diff --git a/src/main/java/theoSeq/MassTool.java b/src/main/java/theoSeq/MassTool.java new file mode 100644 index 0000000..881f496 --- /dev/null +++ b/src/main/java/theoSeq/MassTool.java @@ -0,0 +1,243 @@ +package theoSeq; + +import java.util.*; +import java.util.regex.*; + +public class MassTool { + + private static final double DOUBLE_ZERO = 1e-6; + private static final double PROTON_MASS = 1.00727646688; + + private final Map mass_table = new HashMap<>(); + private final int missed_cleavage; + private final int nterm_linkable; + + public MassTool(final int missed_cleavage, Map fix_mod_map, int nterm_linkable) { + this.missed_cleavage = missed_cleavage; + this.nterm_linkable = nterm_linkable; + mass_table.put("G", 57.021464 + fix_mod_map.get("G")); + mass_table.put("A", 71.037114 + fix_mod_map.get("A")); + mass_table.put("S", 87.032028 + fix_mod_map.get("S")); + mass_table.put("P", 97.052764 + fix_mod_map.get("P")); + mass_table.put("V", 99.068414 + fix_mod_map.get("V")); + mass_table.put("T", 101.047678 + fix_mod_map.get("I")); + mass_table.put("C", 103.009184 + fix_mod_map.get("C")); + mass_table.put("I", 113.084064 + fix_mod_map.get("I")); + mass_table.put("L", 113.084064 + fix_mod_map.get("L")); + mass_table.put("N", 114.042927 + fix_mod_map.get("N")); + mass_table.put("D", 115.026943 + fix_mod_map.get("D")); + mass_table.put("Q", 128.058578 + fix_mod_map.get("Q")); + mass_table.put("K", 128.094963 + fix_mod_map.get("K")); + mass_table.put("E", 129.042593 + fix_mod_map.get("E")); + mass_table.put("M", 131.040485 + fix_mod_map.get("M")); + mass_table.put("H", 137.058912 + fix_mod_map.get("H")); + mass_table.put("F", 147.068414 + fix_mod_map.get("F")); + mass_table.put("R", 156.101111 + fix_mod_map.get("R")); + mass_table.put("Y", 163.063329 + fix_mod_map.get("Y")); + mass_table.put("W", 186.079313 + fix_mod_map.get("W")); + mass_table.put("X", 118.805717); + mass_table.put("Z", fix_mod_map.get("Z")); + mass_table.put("Hatom", 1.007825032); + mass_table.put("Natom", 14.00307401); + mass_table.put("Oatom", 15.99491462); + mass_table.put("Patom", 30.97376151); + mass_table.put("Satom", 31.97207069); + } + + public double calResidueMass(String seq) { // Caution: nterm mod is not included!!!!! + double total_mass = 0; + int length = seq.length(); + for (int idx = 0; idx < length; ++idx) { + total_mass += mass_table.get(seq.substring(idx, idx + 1)); + } + + return total_mass; + } + + public Set buildChainSet(String pro_seq) { + Map> digest_range_map = digestTrypsin(pro_seq); + Set chain_seq_set = new HashSet<>(); + + for (int i = 0; i <= missed_cleavage; ++i) { + for (int[] digest_range_1 : digest_range_map.get(i)) { + String sub_string = pro_seq.substring(digest_range_1[0], digest_range_1[1]); + if (sub_string.substring(0, sub_string.length() - 1).contains("K")) { + // If there is a K in middle, this peptide is a chain. + chain_seq_set.add(sub_string); + } + } + if (nterm_linkable == 1) { + if (digest_range_map.get(i).size() > 0) { + int[] digest_range = digest_range_map.get(i).get(0); + String sub_string = pro_seq.substring(digest_range[0], digest_range[1]); + chain_seq_set.add(sub_string); + } + } + } + return chain_seq_set; + } + + public double[][] buildChainIonArray(String pep_chain, int min_charge, int max_charge) { + // [NOTE] The b/y-ions charge 0 + int charge_num = max_charge - min_charge + 1; + double[][] chain_ion_array = new double[2 * charge_num][pep_chain.length()]; + double b_ion_mass = mass_table.get("Z"); + double y_ion_mass = calResidueMass(pep_chain) + mass_table.get("Z") + 2 * mass_table.get("Hatom") + mass_table.get("Oatom"); + + for (int charge = min_charge; charge <= max_charge; ++charge) { + double b_ion_mass_charge = (b_ion_mass + charge * PROTON_MASS) / charge; + double y_ion_mass_charge = (y_ion_mass + charge * PROTON_MASS) / charge; + int charge_idx = charge - min_charge; + int charge_idx_2 = 2 * charge_idx; + int charge_idx_2_1 = charge_idx_2 + 1; + + for (int idx = 0; idx < pep_chain.length(); ++idx) { + // y-ion + chain_ion_array[charge_idx_2_1][idx] = y_ion_mass_charge; + + String aa = pep_chain.substring(idx, idx + 1); + + // b-ion + b_ion_mass_charge += mass_table.get(aa) / charge; + chain_ion_array[charge_idx_2][idx] = b_ion_mass_charge; + + // Calculate next y-ion + if (idx == 0) { + y_ion_mass_charge -= (mass_table.get(aa) / charge + mass_table.get("Z") / charge); + } else { + y_ion_mass_charge -= mass_table.get(aa) / charge; + } + } + } + + return chain_ion_array; + } + + public Map returnMassTable() { + return mass_table; + } + + public double[] buildVector(double[][] ion_matrix, int precursor_charge, int min_charge) { + int col_num = ion_matrix[0].length; + + int max_row = Math.min(ion_matrix.length / 2, precursor_charge - min_charge) * 2; + double[] temp = new double[max_row * col_num]; + for (int i = 0; i < max_row; ++i) { + System.arraycopy(ion_matrix[i], 0, temp, i * col_num, col_num); + } + Arrays.sort(temp); + + int left_border = 0; + int right_border = temp.length; + boolean has_duplicate = false; + for (int i = 0; i < temp.length - 1; ++i) { + if (Math.abs(temp[i] - temp[i+1]) < DOUBLE_ZERO) { + temp[i] = 0; + ++left_border; + has_duplicate = true; + } + } + + if (has_duplicate) { + Arrays.sort(temp); + return Arrays.copyOfRange(temp, left_border, right_border); + } else { + return Arrays.copyOfRange(temp, left_border, right_border); + } + } + + public double[][] buildPseudoCLIonArray(double[][] seq_ion, int link_site, int[] common_ion_charge, int[] xlink_ion_charge, float additional_mass) { // TODO: check + int charge_num = xlink_ion_charge[xlink_ion_charge.length - 1] - common_ion_charge[0] + 1; + int col_num = seq_ion[0].length; + double[][] cl_ion_array = new double[2 * charge_num][col_num]; + + // Common ion only. + for (int charge = common_ion_charge[0]; charge < xlink_ion_charge[0]; ++charge) { + int charge_idx = charge - common_ion_charge[0]; + int charge_idx_2 = 2 * charge_idx; + int charge_idx_2_1 = charge_idx_2 + 1; + System.arraycopy(seq_ion[charge_idx_2], 0, cl_ion_array[charge_idx_2], 0, link_site); + System.arraycopy(seq_ion[charge_idx_2_1], link_site + 1, cl_ion_array[charge_idx_2_1], link_site + 1, col_num - link_site - 1); + } + + // Common and xlink ion + for (int charge = xlink_ion_charge[0]; charge <= common_ion_charge[common_ion_charge.length - 1]; ++charge) { + int charge_idx = charge - common_ion_charge[0]; // Caution! + int charge_idx_2 = 2 * charge_idx; + int charge_idx_2_1 = charge_idx_2 + 1; + System.arraycopy(seq_ion[charge_idx_2], 0, cl_ion_array[charge_idx_2], 0, link_site); + System.arraycopy(seq_ion[charge_idx_2_1], link_site + 1, cl_ion_array[charge_idx_2_1], link_site + 1, col_num - link_site - 1); + double addition_mz = additional_mass / charge; + for (int idx = 0; idx < col_num; ++idx) { + if (idx < link_site) { + cl_ion_array[charge_idx_2_1][idx] = seq_ion[charge_idx_2_1][idx] + addition_mz; + } else if (idx == link_site) { + cl_ion_array[charge_idx_2][idx] = seq_ion[charge_idx_2][idx] + addition_mz; + cl_ion_array[charge_idx_2_1][idx] = seq_ion[charge_idx_2_1][idx] + addition_mz; + } else { + cl_ion_array[charge_idx_2][idx] = seq_ion[charge_idx_2][idx] + addition_mz; + } + } + } + + // Xlink ion only + for (int charge = common_ion_charge[common_ion_charge.length - 1] + 1; charge <= xlink_ion_charge[xlink_ion_charge.length - 1]; ++charge) { + int charge_idx = charge - common_ion_charge[0]; // Caution! + int charge_idx_2 = 2 * charge_idx; + int charge_idx_2_1 = charge_idx_2 + 1; + + // Alpha chain + double addition_mz = additional_mass / charge; + for (int idx = 0; idx < col_num; ++idx) { + if (idx < link_site) { + cl_ion_array[charge_idx_2_1][idx] = seq_ion[charge_idx_2_1][idx] + addition_mz; + } else if (idx == link_site) { + cl_ion_array[charge_idx_2][idx] = seq_ion[charge_idx_2][idx] + addition_mz; + cl_ion_array[charge_idx_2_1][idx] = seq_ion[charge_idx_2_1][idx] + addition_mz; + } else { + cl_ion_array[charge_idx_2][idx] = seq_ion[charge_idx_2][idx] + addition_mz; + } + } + } + + return cl_ion_array; + } + + private Map> digestTrypsin(String pro_seq) { + // Cut a protein + List cut_point_list = new LinkedList<>(); + int length = pro_seq.length(); + Pattern rep_1 = Pattern.compile("(K|R)(?=[^P])"); + int idx_start = 0; + Matcher match_obj = rep_1.matcher(pro_seq); + cut_point_list.add(0); + while (idx_start < length) { + if (match_obj.find()) { + int cut_point = match_obj.end(); + cut_point_list.add(cut_point); + idx_start = cut_point; + } else { + cut_point_list.add(length); + break; + } + } + + Collections.sort(cut_point_list); + + // Deal with missed cleavage + Map> digest_range_map = new HashMap<>(); + for (int time = 0; time <= missed_cleavage; ++time) { + List temp = new LinkedList<>(); + int left_point = 0; + int right_point = 0; + for (int i = 0; i + 1 + time < cut_point_list.size(); ++i) { + left_point = cut_point_list.get(i); + right_point = cut_point_list.get(i + 1 + time); + temp.add(new int[]{left_point, right_point}); + } + digest_range_map.put(time, temp); + } + + return digest_range_map; + } +} diff --git a/src/main/resources/parameter.def b/src/main/resources/parameter.def new file mode 100644 index 0000000..14d8404 --- /dev/null +++ b/src/main/resources/parameter.def @@ -0,0 +1,49 @@ +# Database +db = D:\Cross_Linking_Data\Aebersold\db\26Syeast.fasta # The whole protein database +decoy_db = reverse # shuffle, reverse or decoy database file path +missed_cleavage = 2 # maximum number of missed cleavage +min_precursor_mass = 1000 +max_precursor_mass = 5000 +min_chain_length = 5 # minimum length of a peptide chain +max_chain_length = 50 + +# Spectrum +ms1_charge = 2,3,4,5 +common_ion_charge = 1,2 +xlink_ion_charge = 2,3 +min_peak_mz = 0 +min_peak_num = 20 + +# Tolerance +ms1_tolerance_unit = 1 # 0: Da; 1: ppm +ms1_tolerance = 10 +ms2_tolerance = 0.2 # unit: m/z + +# Cross-linking parameter. +# Cross-linking site can not be modified. +cl_aa = K +nterm_linkable = 1 +cl_mass = 138.0680796 + +# Fix modification +G = 0 +A = 0 +S = 0 +P = 0 +V = 0 +T = 0 +C = 57.02146 +I = 0 +L = 0 +N = 0 +D = 0 +Q = 0 +K = 0 +E = 0 +M = 0 +H = 0 +F = 0 +R = 0 +Y = 0 +W = 0 +Z = 0 # nterm mod \ No newline at end of file