Skip to content

Commit

Permalink
GPTWiki current release updates
Browse files Browse the repository at this point in the history
  • Loading branch information
edwardsnj committed Jun 23, 2021
1 parent e44567b commit a754a86
Show file tree
Hide file tree
Showing 285 changed files with 3,767 additions and 1,095 deletions.
2,228 changes: 1,353 additions & 875 deletions smw/gptwiki/export/glycosites.tsv

Large diffs are not rendered by default.

152 changes: 106 additions & 46 deletions smw/gptwiki/model/GPTWiki.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@

__all__ = [ "GPTWiki", "Glycan", "Protein", "Peptide", "TransitionGroup", "Transition", "Acquisition", "Alignment" ]
__all__ = [ "GPTWiki", "Glycan", "Protein", "Peptide", "TransitionGroup", "Transition", "Acquisition", "Alignment", "ProteinSite" ]

import sys, re, os, os.path, json
from operator import itemgetter
Expand Down Expand Up @@ -277,23 +277,66 @@ def toTemplate(self,data):

class Alignment(SMW.SMWClass):
    """Peptide-to-protein alignment record (SMW template 'Alignment')."""
    template = 'Alignment'

    def toPython(self, data):
        """Convert template string fields to Python values.

        'start' and 'end' become ints; a comma-separated 'site' string
        becomes a de-duplicated, sorted list of ProteinSite objects.
        """
        data = super(Alignment, self).toPython(data)

        # start/end arrive as strings from the wiki template
        for k in ('start', 'end'):
            if isinstance(data.get(k), basestring):
                data[k] = int(data.get(k))

        if isinstance(data.get('site'), basestring):
            data['site'] = list(map(ProteinSite.asProteinSite, set(data.get('site').split(','))))
            # BUG FIX: ProteinSite.sortkey is a one-argument key function,
            # not a two-argument cmp function. Passing it positionally makes
            # Python 2 list.sort treat it as cmp and call it with two
            # arguments, raising TypeError. It must be passed as key=.
            data['site'].sort(key=ProteinSite.sortkey)

        return data

    def toTemplate(self, data):
        """Serialize back to template strings.

        'site' becomes a comma-joined string of ProteinSite pagenames,
        de-duplicated by pagename and ordered by (protein, position).
        """
        data = super(Alignment, self).toTemplate(data)

        if 'site' in data:
            sites = dict()
            for ps in data['site']:
                sites[ProteinSite.pagename(**dict(ps.items()))] = ps
            data['site'] = ",".join(map(lambda t: t[0], sorted(sites.items(), key=lambda t: ProteinSite.sortkey(t[1]))))

        return data

    @staticmethod
    def sortkey(al):
        # Order alignments by protein accession, then start coordinate.
        return al.get('protein'), al.get('start')

class ProteinSite(SMW.SMWClass):
    """A single site on a protein (SMW template 'ProteinSite').

    A site is identified by its protein accession, one-letter amino-acid
    code, and integer sequence position.
    """
    template = 'ProteinSite'

    @staticmethod
    def pagename(**kwargs):
        """Canonical wiki pagename for a site: "<protein>@<aa><position>"."""
        assert kwargs.get('protein')
        assert kwargs.get('aa')
        assert kwargs.get('position')
        return "%(protein)s@%(aa)s%(position)s"%kwargs

    @staticmethod
    def asProteinSite(s):
        """Parse a "<protein>@<aa><position>" pagename into a ProteinSite."""
        accession, residue = s.split('@')
        return ProteinSite(protein=accession, aa=residue[0], position=int(residue[1:]))

    def toPython(self, data):
        """Coerce template string fields to Python values (position -> int)."""
        data = super(ProteinSite, self).toPython(data)
        value = data.get('position')
        if isinstance(value, basestring):
            data['position'] = int(value)
        return data

    def toTemplate(self, data):
        # No site-specific serialization beyond the base-class behavior.
        return super(ProteinSite, self).toTemplate(data)

    @staticmethod
    def sortkey(ps):
        # Order sites by protein accession, then sequence position.
        return ps.get('protein'), ps.get('position')

class Acquisition(SMW.SMWClass):
template = 'Acquisition'

Expand Down Expand Up @@ -348,12 +391,10 @@ def lock_cache(self,clsname):
if self.has_lock(clsname):
return
filename = self.cache_file(clsname)
lock = FileLock(filename)
lock = FileLock(filename,threaded=False)
try:
lock.acquire(10)
except LockTimeout:
if self._cache_locks.get(clsname):
self._cache_locks[clsname].release()
raise RuntimeError("Can't obtain lock for %s cache file"%(clsname,))
self._cache_locks[clsname] = lock

Expand Down Expand Up @@ -386,6 +427,9 @@ def cache_ids(self,clsname):
def verify_ids(self, clsname, iterids):
    """True iff the cached ids for clsname match iterids exactly (as sets)."""
    cached = set(self.cache_ids(clsname))
    return cached == set(iterids)

def missing_ids(self, clsname, iterids):
    """Return the set of ids in iterids that are absent from the clsname cache."""
    cached = set(self.cache_ids(clsname))
    return set(iterids).difference(cached)

def add_to_cache(self,clsname,key,value):
self.lock_cache(clsname)
if self._cache.get(clsname) is None:
Expand All @@ -400,49 +444,64 @@ def is_in_cache(self,clsname,key):
return (self.get_from_cache(clsname,key) is not None)

def peptideindex(self):
self.lock_cache("peptides")
self.read_cache("peptides")
if self.has_cache("peptides"):
if self.verify_ids("peptides",self.iterpeptideids()):
self.release_cache("peptides")
return
print >>sys.stderr, "(Re-)computing peptide index from site..."
for p in self.iterpeptides():
key = Peptide.key(p.get('sequence'),p.get('glycan',[]),p.get('mod',[]))
self.add_to_cache("peptides",key,p.get('id'))
self.write_cache("peptides")
self.release_cache("peptides")
print >>sys.stderr, "(Re-)computing peptide index from site... done."
try:
self.lock_cache("peptides")
self.read_cache("peptides")
if self.has_cache("peptides"):
absent = self.missing_ids("peptides",self.iterpeptideids())
if len(absent) == 0:
return
else:
absent = set(self.iterpeptideids())
print >>sys.stderr, "Computing %d peptide keys from site..."%(len(absent))
for pid in absent:
p = self.get(pid)
key = Peptide.key(p.get('sequence'),p.get('glycan',[]),p.get('mod',[]))
self.add_to_cache("peptides",key,p.get('id'))
self.write_cache("peptides")
print >>sys.stderr, "Computing %d peptide keys from site... done."%(len(absent))
finally:
self.release_cache("peptides")

def tgindex(self):
self.lock_cache("tgs")
self.read_cache("tgs")
if self.has_cache("tgs"):
if self.verify_ids("tgs",self.itertransgroupids()):
self.release_cache("tgs")
return
print >>sys.stderr, "(Re-)computing transition group index..."
for tg in self.itertransgroups():
key = TransitionGroup.key(tg.get('peptide'),tg.get('z1'),tg.get('spectra'))
self.add_to_cache("tgs",key,tg.get('id'))
self.write_cache("tgs")
self.release_cache("tgs")
print >>sys.stderr, "(Re-)computing transition group index... done."
try:
self.lock_cache("tgs")
self.read_cache("tgs")
if self.has_cache("tgs"):
absent = self.missing_ids("tgs",self.itertransgroupids())
if len(absent) == 0:
return
else:
absent = set(self.itertransgroupids())
print >>sys.stderr, "Computing %d transition group keys from site..."%(len(absent))
for tgid in absent:
tg = self.get(tgid)
key = TransitionGroup.key(tg.get('peptide'),tg.get('z1'),tg.get('spectra'))
self.add_to_cache("tgs",key,tg.get('id'))
self.write_cache("tgs")
print >>sys.stderr, "Computing %d transition group keys from site... done."%(len(absent))
finally:
self.release_cache("tgs")

def transindex(self):
self.lock_cache("trans")
self.read_cache("trans")
if self.has_cache("trans"):
if self.verify_ids("trans",self.itertransitionids()):
self.release_cache("trans")
return
print >>sys.stderr, "(Re-)computing transition index..."
for t in self.itertransitions():
key = Transition.key(t.get('peptide'),t.get('z1'),t.get('label'),t.get('z2'))
self.add_to_cache("trans",key,t.get('id'))
self.write_cache("trans")
self.release_cache("trans")
print >>sys.stderr, "(Re-)computing transition index... done."
try:
self.lock_cache("trans")
self.read_cache("trans")
if self.has_cache("trans"):
absent = self.missing_ids("trans",self.itertransitionids())
if len(absent) == 0:
return
else:
absent = set(self.itertransitionids())
print >>sys.stderr, "Computing %d transition keys from site..."%(len(absent))
for tid in absent:
t = self.get(tid)
key = Transition.key(t.get('peptide'),t.get('z1'),t.get('label'),t.get('z2'))
self.add_to_cache("trans",key,t.get('id'))
self.write_cache("trans")
print >>sys.stderr, "Computing %d transition keys from site... done."%(len(absent))
finally:
self.release_cache("trans")

def addprotein(self,accession,**kw):
p = self.get(accession)
Expand Down Expand Up @@ -538,9 +597,10 @@ def iteridbyregex(self,regex,prefix=None):

def iterregex(self,regex,prefix=None):
regex = re.compile(regex)
for page in self.site.allpages(prefix=prefix,generator=True):
if regex.search(page.name):
yield self.parse_pagetext(page.name,page.text())
for pagename in self.site.allpages(prefix=prefix,generator=False):
if regex.search(pagename):
yield self.get(pagename)
# yield self.parse_template_text(page.name,page.text())

def iterglycans(self):
for g in self.iterregex(r'^G\d{5}[A-Z]{2}$',prefix="G"):
Expand Down
33 changes: 19 additions & 14 deletions smw/gptwiki/scripts/OpenSWATH.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Options:
Parameter file sets the following variables:
LCCALIBRATION=<lc-calibration>
TRANSITIONS=<Transition-Library-File-With-Decoys>
NDECOYS=<Number-of-Decoy-Replicates>
WINDOWS=<SWATH-Precursor-Windows-File>
Expand All @@ -33,7 +34,6 @@ EOF
DIR=`dirname $0`
DIR=`readlink -f $DIR`

LCCAL="NRT"
VERBOSE=0
OUTDIR="."
PARAMS="./params.txt"
Expand All @@ -48,24 +48,28 @@ while getopts ":c:o:vh" o ; do
esac
done

DONRT=0; DOEND=0; DOESI=0;
case $LCCAL in
NRT) DONRT=1;;
END) DOEND=1;;
ESI) DOESI=1;;
NONE) DONONE=1;;
*) help "Bad calibration (-c) parameter: $LCCAL";;
esac

shift $(($OPTIND - 1))

if [ ! -f "$PARAMS" ]; then
help "Parameter file \"$PARAMS\" could not be opened"
fi

PARAMS=`readlink -f "$PARAMS"`

export GPTDATA=/data2/projects/GlyGen/PyGly/smw/gptwiki/data
. $PARAMS

LCCAL=${LCCAL:-${LCCALIBRATION:-NRT}}

DONRT=0; DOEND=0; DOESI=0;
case $LCCAL in
NRT) DONRT=1;;
END) DOEND=1;;
ESI) DOESI=1;;
NONE) DONONE=1;;
*) help "Bad calibration (-c) parameter: $LCCAL";;
esac

if [ $DONRT = 1 -a "$NRTTRANSITIONS" = "" ]; then
help "NRT Peptide calibration: NRTTRANSITIONS missing from parameter file $1"
fi
Expand Down Expand Up @@ -98,10 +102,11 @@ if [ "$NDECOYS" = "" ]; then
NDECOYS=5
fi

OSW="/tools/openswath/bin/OpenSwathWorkflow.sh"
OSW="$DIR/OpenSwathWorkflow.sh"

function openswath() {
FILE="$1"
# FILE=`readlink -f "$1"`
shift
BASE=`basename "$FILE" .mzML.gz`
echo $OSW \
Expand Down Expand Up @@ -147,11 +152,11 @@ function openswath_esi() {
}

function getcoeff() {
apython $DIR/getoswcoeffs.py "$@"
python2 $DIR/getoswcoeffs.py "$@"
}

function fdr() {
apython $DIR/../analysis/fdr.py "$@"
python2 $DIR/../analysis/fdr.py "$@"
}

function openswath_cal() {
Expand Down Expand Up @@ -184,7 +189,7 @@ function openswath_cal() {
}

for f in "$@"; do
TMP=`mktemp -d -p /scratch -t openswath.XXXXXX`
TMP=`mktemp -d -p . -t .openswath.XXXXXX`
BASE=`basename "$f" .mzML.gz`
LOG="${OUTDIR}/${BASE}.log"
echo "" > "$LOG"
Expand Down
10 changes: 7 additions & 3 deletions smw/gptwiki/scripts/OpenSWATH2JSON.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,5 @@
#!/bin/sh

set -x

help() {
if [ "$1" != "" ]; then
echo "" 1>&2
Expand Down Expand Up @@ -30,6 +28,8 @@ EOF
DIR=`dirname $0`
DIR=`readlink -f $DIR`

export GPTDATA=/data2/projects/GlyGen/PyGly/smw/gptwiki/data

VERBOSE=0
OUTDIR="."
FDR="1"
Expand Down Expand Up @@ -71,8 +71,12 @@ fi
function openswath2json() {
BASE0=`basename "$1" .mzML.gz`
BASE1=`basename "$1" .centroid.mzML.gz`
if [ "$BASE1" = "$1" ]; then
# .centroid is missing...
BASE1=`basename "$1" .mzML.gz`
fi
rm -rf "$OUTDIR/$BASE1"
apython $DIR/openswath2json.py --transitions "$TRANSITIONS" --ndecoys $NDECOYS --outdir "$OUTDIR/$BASE1" \
python2 $DIR/openswath2json.py --transitions "$TRANSITIONS" --ndecoys $NDECOYS --outdir "$OUTDIR/$BASE1" \
--chromatograms="${BASE0}_chrom.mzML" --results "${BASE0}_table.tsv" \
--fdr $FDR --score $SCORE
mkdir -p "$OUTDIR/$BASE1"
Expand Down
1 change: 1 addition & 0 deletions smw/gptwiki/scripts/OpenSwathWorkflow.sh
21 changes: 21 additions & 0 deletions smw/gptwiki/scripts/addproteinsite.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/bin/env python2

from getwiki import GPTWiki, Peptide, ProteinSite

import sys

w = GPTWiki()
for pep in w.iterpep():
for al in pep.get('alignments',[]):
pr = al.get('protein')
prsites = al.get('prsites',"").split('|')
for prs in prsites:
aa = prs[0]
pos = int(prs[1:])
ps = ProteinSite(protein=pr,aa=aa,position=pos)
if w.put(ps):
print ProteinSite.pagename(protein=pr,aa=aa,position=pos)
al.append('site',ps)
# al.delete('prsites')
if w.put(pep):
print pep.get('id')
15 changes: 7 additions & 8 deletions smw/gptwiki/scripts/dumpglycosites.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#!/bin/env python27

#!/bin/env python2

import sys
from collections import defaultdict
Expand Down Expand Up @@ -39,19 +38,19 @@
for al in aligns:
pr = al.get('protein')
site = al.get('prsites')
if glyid not in prsites[(pr,int(site[1:]))]:
prsites[(pr,int(site[1:]))][glyid] = dict(aa=tla[site[0]],name=name,cls=cls,pep=set([pep.get('id')]))
if glyid not in prsites[(pr,site[0],int(site[1:]))]:
prsites[(pr,site[0],int(site[1:]))][glyid] = dict(aa=tla[site[0]],name=name,cls=cls,pep=set([pep.get('id')]))
else:
prsites[(pr,int(site[1:]))][glyid]['pep'].add(pep.get('id'))
prsites[(pr,site[0],int(site[1:]))][glyid]['pep'].add(pep.get('id'))

headers = ['UniProt','Site','AminoAcid', 'GlyTouCan', 'Composition', 'GlycoType','Glycopeptides']
headers = ['UniProt','Site','AminoAcid', 'GlyTouCan', 'Composition', 'GlycoType','Glycopeptides','SiteLink']

print "\t".join(headers)
for (pracc,site),glycans in sorted(prsites.items()):
for (pracc,aa,site),glycans in sorted(prsites.items()):
for glyid,data in sorted(glycans.items()):
pepstr = ",".join(sorted(data['pep']))
row = {'UniProt': pracc, 'Site': site,
'AminoAcid': data['aa'], 'GlyTouCan': glyid,
'Composition': data['name'], 'GlycoType': data['cls'],
'Glycopeptides': pepstr}
'Glycopeptides': pepstr, 'SiteLink': "%s@%s%s"%(pracc,aa,site)}
print "\t".join(map(str,map(row.get,headers)))
Loading

0 comments on commit a754a86

Please sign in to comment.