diff --git a/.gitignore b/.gitignore index 344a6e8..17a5b3c 100644 --- a/.gitignore +++ b/.gitignore @@ -5,8 +5,6 @@ __pycache__ # specific folders data/* -tests/* -!test_lovd_fill_hg38.py # large gene reference files *.fa \ No newline at end of file diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8bf5e30..0293adb 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -181,66 +181,50 @@ def lovd_fill_hg38(lovd: pd.DataFrame): """ Fills missing hg38 values in the LOVD dataframe by converting hg19 values to hg38. - New column 'hg19/hg38_lovd' is added to store + New column 'hg38_gnomad_format' is added to store the converted positions in the format '6-position-ref-alt'. :param lovd: pandas DataFrame containing following columns: - 'VariantOnGenome/DNA': hg19 values. - 'VariantOnGenome/DNA/hg38': hg38 values. - :return: None: Modifies the input DataFrame in-place by adding or - updating the 'hg19/hg38_lovd' column. + :return: None: Modifies the input DataFrame in-place by adding + 'hg38_gnomad_format' column. """ if lovd.empty: return - lovd['VariantOnGenome/DNA/hg38'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) - lovd['hg38_gnomad_format'] = lovd.apply(convert_hg19_if_missing, axis=1) + lovd['hg38_gnomad_format'] = lovd['VariantOnGenome/DNA/hg38'].replace('', pd.NA) + missing_hg38_mask = lovd['hg38_gnomad_format'].isna() + lovd.loc[missing_hg38_mask, 'hg38_gnomad_format'] = lovd.loc[missing_hg38_mask, 'VariantOnGenome/DNA'].apply( + convert_hg19_if_missing) + lovd['hg38_gnomad_format'] = lovd['hg38_gnomad_format'].apply(convert_to_gnomad_gen) -def convert_hg19_if_missing(row): +def convert_hg19_if_missing(hg19: pd.Series, lo = LiftOver('hg19', 'hg38')): """ - converts hg19 variant to hg38 if hg38 is missing. - Checks if the hg38 value is missing (NaN) in a given row. - If it is, the hg19 variant is converted to hg38 - using the `convert_hg19_to_hg38` function. - Otherwise, the existing hg38 value is formatted. 
- :param row: single row of the DataFrame. - :return: - - str: hg38 value or a conversion of - the hg19 value in the format '6-position-ref-alt'. + Converts hg19 variant to hg38 if hg38 is missing. + :param hg19: hg19 variant string from the 'VariantOnGenome/DNA' column; may be missing (NA). + :param lo: converter for genomic data between reference assemblies + :return: '?' if the input is missing or contains '?' or '_'; otherwise the variant with its position lifted over to hg38, in the format 'g.positionref>alt'. """ - if pd.isna(row['VariantOnGenome/DNA/hg38']): - return convert_hg19_to_hg38(convert_to_gnomad_gen(row['VariantOnGenome/DNA'])) - return convert_to_gnomad_gen(row['VariantOnGenome/DNA/hg38']) - - -def convert_hg19_to_hg38(position: str, lo=LiftOver('hg19', 'hg38')): - """ - converts a genomic position from hg19 to hg38 using the LiftOver tool. - :param position: string representing the hg19 variant - in the format 'g.positionRef>Alt'. - :param lo: converter for coordinates between genome builds - :return: string converted hg38 position in the format '6-position-ref-alt'. - """ - - if '?' in position: + if pd.isna(hg19): + return "?" + if '?' in hg19 or "_" in hg19: return '?' - new_pos = lo.convert_coordinate('chr6', int(position[2:10]))[0][1] - return f"6-{new_pos}-{position[-3:]}" + position_str = hg19[2:10] + new_pos = lo.convert_coordinate('chr6', int(position_str))[0][1] + return f"g.{new_pos}{hg19[-3:]}" def convert_to_gnomad_gen(variant: str): """ - converts a variant string from hg19 or hg38 format + converts a variant string from hg38 format to the format used by gnomAD ('6-position-ref-alt'). :param variant: str: the variant in the format 'g.startRef>Alt'. :return: str: variant formatted as '6-position-ref-alt' or '?' if the input contains interval ranges or is invalid. 
""" - if not isinstance(variant, str): - raise TypeError(f"Expected a string for 'variant', got {type(variant).__name__} instead") - patterns = { 'dup': re.compile(r'^g\.(\d+)dup$'), 'del': re.compile(r'^g\.(\d+)del$'), diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 6cdacd5..69c80f3 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -4018,8 +4018,8 @@ "metadata": { "collapsed": false, "ExecuteTime": { - "end_time": "2024-09-02T18:03:53.205784Z", - "start_time": "2024-09-02T18:03:43.273240Z" + "end_time": "2024-09-03T07:46:50.003340Z", + "start_time": "2024-09-03T07:46:40.913231Z" } }, "id": "dd9b17623f26a07c", diff --git a/tests/test_lovd_fill_hg38.py b/tests/test_lovd_fill_hg38.py index 822fa3a..c4008d7 100644 --- a/tests/test_lovd_fill_hg38.py +++ b/tests/test_lovd_fill_hg38.py @@ -82,15 +82,6 @@ def test_fill_hg38_no_variants(self): lovd_fill_hg38(self.df) self.assertEqual(self.df.shape[0], 0, "Empty dataframe should not add rows.") - def test_fill_hg38_NA_variants(self): - """Test filling hg38 values when there pd. NA variants in the dataframe.""" - self.df = pd.DataFrame({ - 'VariantOnGenome/DNA': [pd.NA], - 'VariantOnGenome/DNA/hg38': [pd.NA] - }) - with self.assertRaises(TypeError) as context: - lovd_fill_hg38(self.df) - self.assertEqual(str(context.exception), "Expected a string for 'variant', got NAType instead") if __name__ == '__main__':