Skip to content

Commit

Permalink
Added ability to enable or disable the use of manual curation
Browse files Browse the repository at this point in the history
  • Loading branch information
Vedanth-Ramji authored Nov 14, 2023
1 parent 4995c3b commit c8d020e
Showing 1 changed file with 23 additions and 18 deletions.
41 changes: 23 additions & 18 deletions argnorm/normalizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,11 +44,12 @@ class BaseNormalizer:
Inherit this class and customize subclass methods to implement the normalization of tools.
"""

def __init__(self, database=None, is_hamronized=False, mode=None) -> None:
def __init__(self, database=None, is_hamronized=False, mode=None, uses_manual_curation=True) -> None:
self.tool = ''
self.database = database
self.mode = mode
self.is_hamronized = is_hamronized
self.uses_manual_curation = uses_manual_curation
self._set_input_gene_col()
self._set_ref_gene_and_aro_cols()

Expand Down Expand Up @@ -123,27 +124,31 @@ def get_aro_mapping_table(self):
"""
df = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_ARO_mapping.tsv', False), sep='\t', index_col=0)

if self.database == 'sarg' and self.mode == 'orfs':
gene_identifier = 'Categories_in_database'
else:
gene_identifier = 'Original ID'
if self.uses_manual_curation:
if self.database == 'sarg' and self.mode == 'orfs':
gene_identifier = 'Categories_in_database'
else:
gene_identifier = 'Original ID'

if self.database == 'ncbi':
manual_curation = pd.read_csv(get_data_path('ncbi_manual_curation.tsv', True), sep='\t')
elif self.database == 'resfinder':
manual_curation = pd.read_csv(get_data_path('resfinder_manual_curation.tsv', True), sep='\t')
else:
manual_curation = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_manual_curation.tsv', True), sep='\t')
if self.database == 'ncbi':
manual_curation = pd.read_csv(get_data_path('ncbi_manual_curation.tsv', True), sep='\t')
elif self.database == 'resfinder':
manual_curation = pd.read_csv(get_data_path('resfinder_manual_curation.tsv', True), sep='\t')
else:
manual_curation = pd.read_csv(get_data_path(f'{self.tool}_{self.database}_{self.mode}_manual_curation.tsv', True), sep='\t')

aro_nan_indices = [(list(df[gene_identifier]).index(manual_curation.loc[i, gene_identifier])) for i in range(manual_curation.shape[0])]
aro_nan_indices = [(list(df[gene_identifier]).index(manual_curation.loc[i, gene_identifier])) for i in range(manual_curation.shape[0])]

for i in range(len(aro_nan_indices)):
df.loc[aro_nan_indices[i], 'ARO'] = manual_curation.loc[i, 'ARO Replacement']
for i in range(len(aro_nan_indices)):
df.loc[aro_nan_indices[i], 'ARO'] = manual_curation.loc[i, 'ARO Replacement']

if self.tool != 'argsoap' and self.mode != 'orfs':
df.loc[aro_nan_indices[i], 'Gene Name in CARD'] = manual_curation.loc[i, 'Gene Name in CARD']
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(float(a)) if is_number(a) == True else a}')
if self.tool != 'argsoap' and self.mode != 'orfs':
df.loc[aro_nan_indices[i], 'Gene Name in CARD'] = manual_curation.loc[i, 'Gene Name in CARD']
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(float(a)) if is_number(a) == True else a}')
else:
if self.tool != 'argsoap' or self.mode != 'orfs':
df[TARGET_ARO_COL] = df[TARGET_ARO_COL].map(lambda a: f'ARO:{int(a) if a == a else "nan"}') # a == a checks that a is not nan

return df

Expand Down

0 comments on commit c8d020e

Please sign in to comment.