Skip to content

Commit fd4d7fd

Browse files
committed
Address issue #67 by adding the parameter "protein_modslist_file", which points to a file that restricts the specified variable mods so that they apply only to a subset of proteins. Each line of the text file pointed to by that parameter contains a variable modification number and a protein accession string. This functionality currently works only with standard Comet; support for FI will be addressed next. Also simplify WriteVariableMod() in both CometWritePepXML and CometWriteMzIdentML.
1 parent b13807b commit fd4d7fd

9 files changed

+272
-246
lines changed

Comet.cpp

Lines changed: 15 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -366,6 +366,13 @@ void LoadParameters(char *pszParamsFile,
366366
sscanf(szParamVal, "%255s", szTxtFileExt);
367367
pSearchMgr->SetParam("text_file_extension", szTxtFileExt, szTxtFileExt);
368368
}
369+
else if (!strcmp(szParamName, "protein_modslist_file"))
370+
{
371+
char szTmp[256];
372+
szTmp[0] = '\0';
373+
sscanf(szParamVal, "%255s", szTmp);
374+
pSearchMgr->SetParam("protein_modslist_file", szTmp, szTmp);
375+
}
369376
else if (!strcmp(szParamName, "explicit_deltacn"))
370377
{
371378
sscanf(szParamVal, "%d", &iIntParam);
@@ -1758,8 +1765,14 @@ variable_mod15 = 0.0 X 0 3 -1 0 0 0.0\n");
17581765

17591766
fprintf(fp,
17601767
"max_variable_mods_in_peptide = 5\n\
1761-
require_variable_mod = 0\n\
1762-
\n\
1768+
require_variable_mod = 0\n");
1769+
if (iPrintParams == 2)
1770+
{
1771+
fprintf(fp, "protein_modslist_file = # limit variable mods to subset of specified proteins if this file is specified & present\n");
1772+
}
1773+
1774+
fprintf(fp,
1775+
"\n\
17631776
#\n\
17641777
# fragment ions\n\
17651778
#\n\

CometSearch/CometDataInternal.h

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,6 @@ struct Options
130130
int bClipNtermMet; // 0=leave protein sequences alone; 1=also consider w/o N-term methionine
131131
int bClipNtermAA; // 0=leave peptide sequences as-is; 1=clip N-term amino acid from every peptide
132132
int bSkipAlreadyDone; // 0=search everything; 1=don't re-search if .out exists
133-
// int bSkipUpdateCheck; // 0=do not check for updates; 1=check for updates
134133
int bMango; // 0=normal; 1=Mango x-link ms2 input
135134
int bScaleFragmentNL; // 0=no; 1=scale fragment NL for each modified residue contained in fragment
136135
int bCreateIndex; // 0=normal search; 1=create peptide index file
@@ -191,7 +190,6 @@ struct Options
191190
bClipNtermMet = a.bClipNtermMet;
192191
bClipNtermAA = a.bClipNtermAA;
193192
bSkipAlreadyDone = a.bSkipAlreadyDone;
194-
// bSkipUpdateCheck = a.bSkipUpdateCheck;
195193
bMango = a.bMango;
196194
bScaleFragmentNL = a.bScaleFragmentNL;
197195
bCreateIndex = a.bCreateIndex;
@@ -566,6 +564,8 @@ struct VarModParams
566564
int iMaxPermutations;
567565
VarMods varModList[VMODS];
568566
char cModCode[VMODS]; // mod characters
567+
string sProteinLModsListFile; // file containing list of proteins to restrict application of varmods to
568+
multimap<int, string> mmapProteinLModsList; // <varmod#, protein name> vector read from sProteinModsListFile if present
569569

570570
VarModParams& operator=(VarModParams& a)
571571
{
@@ -822,6 +822,14 @@ struct StaticParams
822822
variableModParameters.cModCode[6] = '%';
823823
variableModParameters.cModCode[7] = '!';
824824
variableModParameters.cModCode[8] = '+';
825+
for (int i = 9; i < VMODS; ++i)
826+
{
827+
int iAscii = 88 + i; //start with lower case 'a' ASCII 97
828+
if (iAscii <= 125) // thru '}' which is ASCII 125
829+
variableModParameters.cModCode[i] = (char)(iAscii);
830+
else
831+
variableModParameters.cModCode[i] = '_';
832+
}
825833

826834
variableModParameters.iMaxVarModPerPeptide = 5;
827835
variableModParameters.iMaxPermutations = MAX_PERMUTATIONS;
@@ -866,7 +874,6 @@ struct StaticParams
866874
options.bResolveFullPaths = 1;
867875

868876
options.bSkipAlreadyDone = 1;
869-
// options.bSkipUpdateCheck = 0;
870877
options.bMango = 0;
871878
options.bScaleFragmentNL = 0;
872879
options.bCreateIndex = 0;

CometSearch/CometSearch.cpp

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
#include "ModificationsPermuter.h"
2424

2525
#include <stdio.h>
26+
#include <string.h>
2627
#include <sstream>
2728
#include <bitset>
2829

@@ -1569,6 +1570,7 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe,
15691570
int iStartPos = 0;
15701571
int iEndPos = 0;
15711572
int piVarModCounts[VMODS];
1573+
bool pbVarModProteinFilter[VMODS]; // default true; set to false if a mod has a protein filter that does not match this protein
15721574
int iWhichIonSeries;
15731575
int ctIonSeries;
15741576
int ctLen;
@@ -1612,6 +1614,41 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe,
16121614
if (g_staticParams.options.bClipNtermAA) // skip the N-term residue of every peptide
16131615
iStartPos = 1;
16141616

1617+
for (int i = 0; i < VMODS; ++i)
1618+
{
1619+
piVarModCounts[i] = 0;
1620+
pbVarModProteinFilter[i] = true;
1621+
}
1622+
1623+
// If variable modifications protein filter is applied, check if current sequence
1624+
// is in the protein filter list. Check each variable mod protein filter and if
1625+
// current protein is not on the list, do not apply that particular variable mod.
1626+
// Any variable mod on the list will have pbVarModProteinFilter[?] = false unless
1627+
// the current protein matches a protein on the list.
1628+
if (g_staticParams.variableModParameters.mmapProteinLModsList.size() > 0)
1629+
{
1630+
char szProteinAccession[256];
1631+
sscanf(dbe.strName.c_str(), "%255s", szProteinAccession);
1632+
szProteinAccession[255] = '\0';
1633+
1634+
auto it = g_staticParams.variableModParameters.mmapProteinLModsList.begin();
1635+
while (it != g_staticParams.variableModParameters.mmapProteinLModsList.end())
1636+
{
1637+
int iWhichMod = it->first;
1638+
1639+
pbVarModProteinFilter[iWhichMod - 1] = false; // do not apply this mod to this protein unless it's on the mmapProteinModsList
1640+
1641+
while (it != g_staticParams.variableModParameters.mmapProteinLModsList.end() && it->first == iWhichMod)
1642+
{
1643+
if (strstr(szProteinAccession, it->second.c_str()))
1644+
{
1645+
pbVarModProteinFilter[iWhichMod - 1] = true;
1646+
}
1647+
it++;
1648+
}
1649+
}
1650+
}
1651+
16151652
// Quick clip n-term & PEFF variant check. Start summing amino acid mass at
16161653
// the start variant position and work backwards. If the mass is larger than
16171654
// the max mass or the peptide length is longer than the max length before we
@@ -2124,6 +2161,17 @@ bool CometSearch::SearchForPeptides(struct sDBEntry dbe,
21242161
// Otherwise, at this point, peptide mass is too big which means should be ok for varmod search.
21252162
if (!g_staticParams.options.bCreateIndex && HasVariableMod(piVarModCounts, iStartPos, iEndPos, &dbe))
21262163
{
2164+
// if variable mod protein filter applied, set residue mod count to 0 for the
2165+
// particular variable mod if current protein not on the protein filter list
2166+
if (g_staticParams.variableModParameters.mmapProteinLModsList.size() > 0)
2167+
{
2168+
for (int i = 0; i < VMODS; ++i)
2169+
{
2170+
if (pbVarModProteinFilter[i] == false)
2171+
piVarModCounts[i] = 0;
2172+
}
2173+
}
2174+
21272175
// VariableModSearch also includes looking at PEFF mods
21282176
VariableModSearch(szProteinSeq, piVarModCounts, iStartPos, iEndPos, pbDuplFragment, &dbe);
21292177
}

CometSearch/CometSearch.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -206,7 +206,7 @@ class CometSearch
206206
int *piVarModSites,
207207
struct sDBEntry *dbe);
208208
void VariableModSearch(char *szProteinSeq,
209-
int varModCounts[],
209+
int piVarModCounts[],
210210
int iStartPos,
211211
int iEndPos,
212212
bool *pbDuplFragment,

CometSearch/CometSearchManager.cpp

Lines changed: 52 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -624,7 +624,10 @@ bool CometSearchManager::InitializeStaticParams()
624624
{
625625
if (strData.length() > 0)
626626
strcpy(g_staticParams.szTxtFileExt, strData.c_str());
627-
}
627+
}
628+
629+
if (GetParamValue("protein_modslist_file", strData))
630+
g_staticParams.variableModParameters.sProteinLModsListFile = strData;
628631

629632
if (GetParamValue("peff_obo", strData))
630633
strcpy(g_staticParams.peffInfo.szPeffOBO, strData.c_str());
@@ -805,8 +808,6 @@ bool CometSearchManager::InitializeStaticParams()
805808

806809
GetParamValue("skip_researching", g_staticParams.options.bSkipAlreadyDone);
807810

808-
// GetParamValue("skip_updatecheck", g_staticParams.options.bSkipUpdateCheck);
809-
810811
GetParamValue("mango_search", g_staticParams.options.bMango);
811812

812813
GetParamValue("scale_fragmentNL", g_staticParams.options.bScaleFragmentNL);
@@ -1982,13 +1983,59 @@ bool CometSearchManager::DoSearch()
19821983
if (!g_staticParams.options.bOutputSqtStream) // && !g_staticParams.bIndexDb)
19831984
{
19841985
strOut = "\n Comet version \"" + g_sCometVersion + "\"\n\n";
1985-
// if (!g_staticParams.options.bSkipUpdateCheck)
1986-
// CometCheckForUpdates::CheckForUpdates(strOut.c_str());
19871986

19881987
logout(strOut.c_str());
19891988
fflush(stdout);
19901989
}
19911990

1991+
FILE* fp;
1992+
1993+
// If the file named by the protein_modslist_file parameter can be opened, read its "<variable mod number> <protein string>" entries, one per line.
1994+
if ((fp = fopen(g_staticParams.variableModParameters.sProteinLModsListFile.c_str(), "r")) != NULL)
1995+
{
1996+
char szBuf[512];
1997+
vector<pair<int, string>> vpTmp;
1998+
1999+
printf(" Protein variable modifications filter:\n");
2000+
2001+
while (fgets(szBuf, 512, fp))
2002+
{
2003+
if (strlen(szBuf) > 3)
2004+
{
2005+
char szProtein[512];
2006+
int iWhichMod;
2007+
2008+
if (sscanf(szBuf, "%d %s", &iWhichMod, szProtein) == 2)
2009+
{
2010+
if (iWhichMod > 0 && iWhichMod <= VMODS)
2011+
{
2012+
g_staticParams.variableModParameters.mmapProteinLModsList.insert({ iWhichMod, szProtein });
2013+
}
2014+
}
2015+
}
2016+
}
2017+
fclose(fp);
2018+
2019+
auto it = g_staticParams.variableModParameters.mmapProteinLModsList.begin();
2020+
while (it != g_staticParams.variableModParameters.mmapProteinLModsList.end())
2021+
{
2022+
int iWhichMod = it->first;
2023+
bool bFirst = true;
2024+
2025+
printf(" - variable_mod%02d: ", iWhichMod);
2026+
while (it != g_staticParams.variableModParameters.mmapProteinLModsList.end() && it->first == iWhichMod)
2027+
{
2028+
if (!bFirst)
2029+
printf(", ");
2030+
printf("%s", it->second.c_str());
2031+
it++;
2032+
bFirst = false;
2033+
}
2034+
printf("\n");
2035+
}
2036+
printf("\n");
2037+
}
2038+
19922039
g_staticParams.precalcMasses.iMinus17 = BIN(g_staticParams.massUtility.dH2O);
19932040
g_staticParams.precalcMasses.iMinus18 = BIN(g_staticParams.massUtility.dNH3);
19942041
g_massRange.dMinMass = g_staticParams.options.dPeptideMassLow;

0 commit comments

Comments
 (0)