Skip to content

Commit bfc699e

Browse files
author
Robert Butler
authored
Merge pull request #13 from rbutleriii/weights
final checks for 1.4
2 parents af18cfb + b0ed459 commit bfc699e

File tree

5 files changed

+70
-43
lines changed

5 files changed

+70
-43
lines changed

clinotator/clinotator.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,8 @@ def getargs():
4848
description='Clinical interpretation of ambiguous'
4949
' ClinVar annotations')
5050
parser.add_argument('--log', action='store_true', help='create logfile')
51-
parser.add_argument('--long-log', action='store_true', help='create detailed logfile')
51+
parser.add_argument('--long-log', action='store_true',
52+
help='create detailed logfile')
5253
parser.add_argument('-o', metavar='prefix', dest='outprefix',
5354
default='clinotator',
5455
help='choose an alternate prefix for outfiles')
@@ -85,12 +86,13 @@ def input_selection(file_type, file, outprefix, query_results):
8586
with open(file) as f:
8687

8788
if file_type == 'vid':
88-
id_list = np.unique([line.rstrip('\n') for line in f])
89+
id_list = np.unique([line.rstrip('\r\n') for line in f])
8990
getncbi.get_ncbi_xml(file_type, id_list, query_results)
9091
return False
9192

9293
elif file_type == 'rsid':
93-
id_list = np.unique([line.lstrip('rsRS').rstrip('\n') for line in f])
94+
id_list = np.unique([line.lstrip('rsRS')
95+
.rstrip('\r\n') for line in f])
9496
getncbi.get_ncbi_xml(file_type, id_list, query_results)
9597
return False
9698

@@ -125,7 +127,7 @@ def explode(df, lst_cols, fill_value=''):
125127
col:np.repeat(df[col].values, df[lst_cols[0]].str.len())
126128
for col in idx_cols
127129
}).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \
128-
.loc[:, df.columns]
130+
.loc[:, df.columns]
129131
else:
130132
# at least one list in cells is empty
131133
return pd.DataFrame({

clinotator/getncbi.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -129,7 +129,7 @@ def batch_local(file_type, query_type, id_list, **kwargs):
129129
fetch_handle.close()
130130
try:
131131
result_list.extend(
132-
[link['Id'] for link in record[0]['LinkSetDb'][0]['Link']])
132+
[lnk['Id'] for lnk in record[0]['LinkSetDb'][0]['Link']])
133133
except IndexError:
134134
logging.info('No VIDs for rsIDs {} to {}'.format(start + 1, end))
135135
pass

clinotator/global_vars.py

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
See main, eventually tests will be added for this module
1111
'''
1212

13-
__version__ = "1.3.2"
13+
__version__ = "1.4.0"
1414

1515

1616
### getncbi.py global variables
@@ -69,31 +69,37 @@
6969

7070
# dict of reviewer status weights for each assertion
7171
cutoff = {'practice guideline': 1.25,
72-
'reviewed by expert panel': 1.10,
72+
'reviewed by expert panel': 1.20,
7373
'criteria provided, single submitter': 1.0,
7474
'no assertion for the individual variant': 0.0,
7575
'no assertion criteria provided': 0.0,
7676
'no assertion provided': 0.0}
7777

7878
# dict of assertion weights for scoring
79-
significance = {'Benign': (-6, 'B'),
80-
'Likely benign': (-3, 'LB'),
79+
significance = {'Benign': (-6.46, 'B'),
80+
'benign': (-6.46, 'B'), # submission key error
81+
'Likely benign': (-3.23, 'LB'),
82+
'likely benign': (-3.23, 'LB'),
83+
'Likely Benign': (-3.23, 'LB'), # submission key error
8184
'Uncertain significance': (-0.3, 'US'),
82-
'Uncertain Significance': (-0.3, 'US'),
83-
'Likely pathogenic': (3, 'LP'),
84-
'Pathogenic': (6, 'P'),
85+
'Uncertain Significance': (-0.3, 'US'), # submission key error
86+
'Likely pathogenic': (3.23, 'LP'),
87+
'likely pathogenic': (3.23, 'LP'),
88+
'Likely Pathogenic': (3.23, 'LP'), # submission key error
89+
'Pathogenic': (6.46, 'P'),
90+
'pathogenic': (6.46, 'P'), # submission key error
8591
'drug response': (0, '-'), 'association': (0, '-'),
8692
'risk factor': (0, '-'), 'protective': (0, '-'),
8793
'Affects': (0, '-'),
8894
'conflicting data from submitters': (0, '-'),
8995
'other': (0, '-'), 'not provided': (0, '-')}
9096

9197
# list of weighted score upper bounds for ctps bins
92-
ctps_cutoffs = [('Benign', -26.7),
98+
ctps_cutoffs = [('Benign', -21.318), # lower PI bound for LB
9399
('Benign/Likely benign', -8.4),
94100
('Likely benign', -4.2),
95101
('Uncertain significance', 4.2),
96102
('Likely pathogenic', 8.4),
97-
('Pathogenic/Likely pathogenic', 14.7),
103+
('Pathogenic/Likely pathogenic', 14.858), # upper PI for LP
98104
('Pathogenic', 10000000)]
99105

clinotator/variation.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -107,8 +107,8 @@ class VariationClass:
107107
def __init__(self, variationreport):
108108
self.VID = variationreport.attrib['VariationID']
109109
self.CVVT = variationreport.attrib['VariationType']
110-
reviewstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
111-
self.CVSZ = g.star_dict[reviewstat]
110+
revstat = variationreport.find('.InterpretedRecord/ReviewStatus').text
111+
self.CVSZ = g.star_dict[revstat]
112112
self.allele_parse(variationreport)
113113
self.observation_parse(variationreport)
114114
self.assertion_table_stats(variationreport)
@@ -128,18 +128,19 @@ def allele_parse(self, variationreport):
128128
Alt = []
129129
vcf_match = []
130130

131-
for index, alleles in enumerate(variationreport.findall(self.haplos(variationreport))):
131+
for index, alleles in enumerate(variationreport
132+
.findall(self.haplos(variationreport))):
132133

133134
try:
134135
RS.append(alleles.find('./XRefList/XRef[@DB="dbSNP"]')
135-
.get('ID'))
136+
.get('ID'))
136137
except:
137138
RS.append('.')
138139

139140
try:
140141
Alt.append(alleles
141-
.find('./Location/SequenceLocation[@Assembly="GRCh38"]')
142-
.get('alternateAlleleVCF'))
142+
.find('./Location/SequenceLocation[@Assembly="GRCh38"]')
143+
.get('alternateAlleleVCF'))
143144
except:
144145
Alt.append('.')
145146

@@ -166,13 +167,18 @@ def observation_parse(self, variationreport):
166167
run_already = True
167168
self.CVCS = interpretation \
168169
.find('./Description').text
169-
self.CVLE = interpretation.attrib['DateLastEvaluated']
170+
try:
171+
self.CVLE = interpretation.attrib['DateLastEvaluated']
172+
except KeyError as e:
173+
self.CVLE = '.'
174+
logging.warning('VID {} doesn\'t have a DateLastEvaluated!'
175+
.format(self.VID))
170176

171-
elif (interpretation.get('VariationID') == self.VID and
172-
run_already):
173-
logging.warning('{} has multiple interpretation fields in its re'
174-
'cord omitting as an annotation error. Check rsi'
175-
'd(s) {} manually'.format(self.VID, self.rsID))
177+
elif interpretation.get('VariationID') == self.VID and run_already:
178+
logging.warning('{} has multiple interpretation fields in its '
179+
'record omitting as an annotation error. Check'
180+
' rsid(s) {} manually'.format(self.VID,
181+
self.rsID))
176182
continue
177183

178184
else:
@@ -188,7 +194,8 @@ def pheno_parse(self, assertion, sig_key):
188194

189195
if not pheno_list:
190196
pheno_list.append('{}({})'.format("Not_Provided", sig_key))
191-
logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'], pheno_list))
197+
logging.debug('Disease list for {}: {}'.format(assertion.attrib['ID'],
198+
pheno_list))
192199
return pheno_list
193200

194201
# parse the ClinicalAssertionList subtree of variation report
@@ -197,11 +204,14 @@ def assertion_table_stats(self, variationreport):
197204
age_list = []
198205
cvds_list = []
199206

200-
for assertion in variationreport.findall('./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
207+
for assertion in variationreport.findall(
208+
'./InterpretedRecord/ClinicalAssertionList/ClinicalAssertion'):
201209
observ_set = {"germline", "de novo", "maternal", "paternal",
202210
"inherited", "unknown", "uniparental", "biparental"}
203-
observ_list = {x.text.lower() for x in assertion.findall('./ObservedInList/ObservedIn/Sample/Origin')}
204-
logging.debug('Origin List for {}: {}'.format(assertion.attrib['ID'], observ_list))
211+
observ_list = {x.text.lower() for x in assertion
212+
.findall('./ObservedInList/ObservedIn/Sample/Origin')}
213+
logging.debug('Origin List for {}: {}'
214+
.format(assertion.attrib['ID'], observ_list))
205215
try:
206216
assert len(observ_set.intersection(observ_list)) > 0
207217
revstat_key = assertion.find('ReviewStatus').text
@@ -211,28 +221,32 @@ def assertion_table_stats(self, variationreport):
211221
try:
212222
sig_value = key_test(g.significance, sigval_key)
213223
except:
214-
logging.warn('Assertion {} for VID {} is incorrectly formatted'.format(assertion.attrib['ID'], self.VID))
224+
logging.warn('Assertion {} for VID {} is incorrectly forma'
225+
'tted'.format(assertion.attrib['ID'], self.VID))
215226
continue
216227

217228
if score > 0 and sig_value[0] != 0:
218229
try:
219230
age = calculate_age(assertion.find('./Interpretation')
220231
.get('DateLastEvaluated'))
221232
except:
222-
logging.warning('{} has a missing assertion date!'
223-
.format(self.VID))
233+
logging.debug('Assertion {} for VID {} is missing an a'
234+
'ssertion date!'.format(
235+
assertion.attrib['ID'],self.VID))
224236
continue
225237

226238
age_list.append(age)
227239
D = decimal.Decimal
228240
raw_score.append(float(D(str(score)) * D(str(sig_value[0]))
229241
* D(str(age_weight(age)))))
230-
logging.debug('score: {} sig_value: {} age_weight: {} age: {}'
231-
.format(score, sig_value[0], age_weight(age), age))
242+
logging.debug('score: {} sig_value: {} age_weight: {} age:'
243+
' {}'.format(score, sig_value[0],
244+
age_weight(age), age))
232245

233246
cvds_list += self.pheno_parse(assertion, sig_value[1])
234247
except AssertionError as a:
235-
logging.debug('no germline reports for assertion {}, skipping'.format(assertion.attrib['ID']))
248+
logging.debug('no germline reports for assertion {}, skipping'
249+
.format(assertion.attrib['ID']))
236250
continue
237251

238252
self.CVDS = ';'.join(cvds_list)
@@ -280,8 +294,8 @@ def analysis_stats(self):
280294
'first one!'.format(self.VID))
281295

282296
if cvcs_index is None:
283-
logging.warning('ClinVar significance for {} does not include B,B/LB'
284-
',LB,US,LP,LP/P,P'.format(self.VID))
297+
logging.warning('ClinVar significance for {} does not include B,B/'
298+
'LB,LB,US,LP,LP/P,P'.format(self.VID))
285299
self.CTPS = None
286300
self.CTRR = '.'
287301
return

test/test.tbl

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
VID CVVT rsID CVAL vcfmatch CVCS CVSZ CVNA CVDS CVLE CTRS CTAA CTPS CTRR
22
7 Haplotype 118161496 C ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . .
33
7 Haplotype 200401432 A ['118161496|C', '200401432|A'] Pathogenic 1 1 OMIM:252010(P) 2017-09-01 . 2.0 . .
4-
9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 58.02 2.5384615384615383 Pathogenic 3
4+
9 single nucleotide variant 1800562 A . Conflicting interpretations of pathogenicity, other 1 13 OMIM:235200(P);Not_Provided(P);MedGen:C0392514(P);Orphanet:ORPHA79230(P);MedGen:C0392514(P);MedGen:C0027672(P);HP:HP:0000992(US);HP:HP:0010473(US);OMIM:235200(P);OMIM:235200(P);OMIM:235200(P);OMIM:235200(LP);OMIM:235200(P);OMIM:235200(P) 2019-05-28 62.482 2.5384615384615383 Pathogenic 3
55
11 single nucleotide variant 1800730 T . Uncertain significance 2 4 OMIM:104300(US);OMIM:176100(US);OMIM:176200(US);OMIM:235200(US);OMIM:612635(US);OMIM:614193(US);MedGen:C0392514(US);OMIM:235200(US);MedGen:CN517202(US) 2018-12-15 -1.17 1.25 Uncertain significance 0
6-
14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 56.28 1.7 Pathogenic 0
7-
50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 11.22 3.0 Pathogenic/Likely pathogenic 3
8-
127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 7.8 4.5 Likely pathogenic 2
6+
3521 single nucleotide variant 1801131 G . Conflicting interpretations of pathogenicity, other 1 5 Not_Provided(B);MedGen:CN169374(B);Not_Provided(LB);OMIM:601634(B);MedGen:CN517202(US) 2019-05-28 -20.912 2.0 Benign/Likely benign 3
7+
8178 single nucleotide variant 121909293 T . Conflicting interpretations of pathogenicity, risk factor 1 2 OMIM:167800(LP);Not_Provided(LB) 2018-12-14 0.6459999999999999 2.0 Uncertain significance 0
8+
14125 single nucleotide variant 267606908 C . Pathogenic 3 10 MedGen:CN230736(P);OMIM:160500(P);OMIM:181430(P);OMIM:192600(P);OMIM:255160(P);OMIM:255310(P);OMIM:608358(P);OMIM:613426(P);Not_Provided(P);Orphanet:ORPHA217569(P);OMIM:192600(P);MedGen:C0949658(P);Not_Provided(P);Orphanet:ORPHA217569(P);MedGen:C0949658(P);MedGen:C0007194(P) 2016-12-15 61.1116 1.7 Pathogenic 0
9+
50317 single nucleotide variant 118161496 C . Conflicting interpretations of pathogenicity 1 5 Not_Provided(US);Not_Provided(P);MeSH:D030342(P);OMIM:252010(LP);MedGen:CN517202(US) 2017-08-14 12.117 3.0 Pathogenic/Likely pathogenic 3
10+
55794 single nucleotide variant 180177040 G . Likely pathogenic 0 0 . 2016-05-31 . . . .
11+
127994 Microsatellite 587780147 TGAGATAA . Pathogenic 2 2 MedGen:C0027672(P);MeSH:D009386(P);MedGen:C0027672(P) 2016-06-10 8.398 4.5 Likely pathogenic 2
12+
128294 single nucleotide variant 113288277 T . Benign 2 3 Not_Provided(B);Not_Provided(B);MedGen:C3808739(B) 2017-07-28 -14.212 3.6666666666666665 Benign/Likely benign 1
13+
128297 single nucleotide variant 2465128 G . Benign 2 2 Not_Provided(B);Not_Provided(B) 2016-01-19 -7.752000000000001 5.0 Likely benign 2
914
214885 single nucleotide variant 200401432 A . Uncertain significance 2 2 MeSH:D030342(US);Not_Provided(US) 2017-12-29 -0.54 2.0 Uncertain significance 0
10-
230850 Microsatellite 587780147 T . Pathogenic 2 2 MedGen:C0027672(P);MedGen:C0027672(P) 2018-10-15 11.4 1.5 Pathogenic/Likely pathogenic 1
15+
225696 single nucleotide variant 1057517686 T . Pathogenic/Likely pathogenic 2 2 Not_Provided(LP);Not_Provided(P) 2018-02-08 8.398 2.5 Likely pathogenic 1

0 commit comments

Comments
 (0)