@@ -49,7 +49,7 @@ def parse_args():
49
49
50
50
# create index from GVF
51
51
# make empty index df
52
- index_cols = ['pos' , 'mutation' , 'hgvs_aa_mutation' , 'hgvs_nt_mutation' , 'gene ' , 'protein_name' , 'alias' , 'hgvs_alias' , 'alias_protein' , 'Pokay_annotation' , 'lineages' ]
52
+ index_cols = ['pos' , 'mutation' , 'hgvs_aa_mutation' , 'hgvs_nt_mutation' , 'gene_name ' , 'gene_symbol' , ' protein_name' , 'protein_symbol ' , 'alias' , 'hgvs_alias' , 'alias_protein' , 'Pokay_annotation' , 'lineages' ]
53
53
index = pd .DataFrame (np .empty ((gvf .shape [0 ], len (index_cols ))), columns = index_cols )
54
54
# populate index df with gvf info
55
55
index ['pos' ] = gvf ['#start' ]
@@ -60,8 +60,10 @@ def parse_args():
60
60
index ['hgvs_alias' ] = gvf ['hgvs_alias' ]
61
61
index ['alias_protein' ] = 'n/a'
62
62
index .loc [index ['alias' ]!= 'n/a' , 'alias_protein' ] = gvf ['mat_pep' ]
63
- index ['gene' ] = gvf ['gene' ]
63
+ index ['gene_name' ] = gvf ['gene_name' ]
64
+ index ['gene_symbol' ] = gvf ['gene_symbol' ]
64
65
index ['protein_name' ] = gvf ['protein_name' ]
66
+ index ['protein_symbol' ] = gvf ['protein_symbol' ]
65
67
index ['Pokay_annotation' ] = gvf ["function_description" ].notna ()
66
68
index ['lineages' ] = gvf ['viral_lineage' ]
67
69
# tidying
@@ -73,7 +75,7 @@ def parse_args():
73
75
# create log from index
74
76
log = index .copy ()
75
77
# fill in 'new_mutations' column like: "gene:mutation"
76
- log ['new_mutations' ] = log ["gene " ] + ":" + log ["mutation" ]
78
+ log ['new_mutations' ] = log ["gene_symbol " ] + ":" + log ["mutation" ]
77
79
# for orf1ab mutations, fill in 'new_mutations' column like: "gene:mutation / nsp:alias"
78
80
log .loc [log ['alias' ]!= 'n/a' , 'new_mutations' ] = log ['new_mutations' ] + " / " + log ["alias_protein" ] + ":" + log ["alias" ]
79
81
# drop duplicates (there shouldn't be any)
0 commit comments