
Commit 0e3bbac

Merge pull request #35 from SietsmaRJ/master
Exposed transcript identifier in CAPICE output
2 parents b520d41 + 2469fd6 commit 0e3bbac

File tree: 8 files changed, +28 -61 lines changed

README.md

Lines changed: 14 additions & 4 deletions
@@ -33,9 +33,9 @@ The following sections will guide you through the steps needed for the variant a
 making predictions using the CAPICE model.
 
 ### Download and installation (UNIX like systems)
-__Note: this install is for Python 3.7 and Python 3.8.
+__Note: this install is for Python 3.7, Python 3.8 and Python 3.9.
 Python 3.6 is also supported and install can be found at the bottom of this chapter.
-Python 3.5 and lower or Python 3.9 and higher is not supported (yet).__
+Python 3.5 and lower is not supported.__
 
 1. Software and libraries
 CAPICE scripts can be downloaded from the CAPICE github repository.
@@ -95,7 +95,18 @@ CAPICE requires the following arguments:
 - -i / --input: The path to the input [CADD annotated](https://cadd.gs.washington.edu/) dataset using the tab separator (can be both gzipped or not). An example of an input TSV file can be found in `CAPICE_example/test_cadd14_grch37_annotated.tsv.gz` for CADD 1.4 and genome build 37.
 
 The following flags are optional:
-- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exists). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped!__
+- -o / --output: The path to the directory, output filename or output directory and filename where the output is placed (will be made if it does not exists). If only a filename is supplied, or no output is supplied, the file will be placed within the input directory. __The file will always be gzipped with a .gz extension!__
+
+_For instance:_
+
+`-i input.txt` becomes `input_capice.txt.gz`
+
+`-i input.txt -o output.txt` becomes `output.txt.gz`
+
+`-i input.txt -o path/to/output.txt` becomes `path/to/output.txt.gz`
+
+`-i input.txt -o path/to/output` becomes `path/to/output/input_capice.txt.gz`
+
 - -v / --verbose: Display more in depth messages within the progress of CAPICE.
 - -f / --force: Overwrite an output file if already present (does NOT work for logfiles).
 - --train: Activates the 'train new CAPICE-like models' within CAPICE.
@@ -108,7 +119,6 @@ A file will be put out containing the following columns:
 
 - __No__ index
 - chr_pos_ref_alt: column containing the chromosome, position, reference and alternative separated by an underscore.
-- ID: Column full of `.`.
 - GeneName: The ENSEMBL gene name of the variant as supplied by CADD.
 - FeatureID: The ENSEMBL feature ID (Transcript ID or regulatory feature ID).
 - Consequence: The type of consequence that the variant has as supplied by CADD.
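The README changes above describe the new output: a gzipped, tab-separated file with the columns listed. A minimal sketch of reading such a file back with pandas (the filename is the README's own example, not a file shipped with the repository; pandas infers the gzip compression from the `.gz` extension):

```python
import pandas as pd

# Hypothetical example file name taken from the README's "-i input.txt" case.
capice_scores = pd.read_csv('input_capice.txt.gz', sep='\t')

# The transcript identifier exposed by this PR lives in the FeatureID column.
print(capice_scores[['chr_pos_ref_alt', 'FeatureID', 'probabilities']].head())
```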

src/main/python/core/exporter.py

Lines changed: 1 addition & 27 deletions
@@ -5,7 +5,6 @@
 import os
 import pandas as pd
 import pickle
-import warnings
 
 
 class Exporter:
@@ -31,34 +30,9 @@ def export_capice_prediction(self, datafile: pd.DataFrame):
         :param datafile: prediction pandas DataFrame
         """
         filename = self._export_filename_ready(file_name=self.capice_filename, check_extension=False)
-        # datafile[self.export_cols].to_csv(filename, sep='\t', index=False)
-        datafile = self._export_legacy_prediction(datafile=datafile)
-        datafile.to_csv(filename, sep='\t', index=False)
+        datafile[self.export_cols].to_csv(filename, sep='\t', compression='gzip', index=False)
         self.log.info('Successfully exported CAPICE datafile to: {}'.format(filename))
 
-    def _export_legacy_prediction(self, datafile):
-        warnings.warn('Using legacy export function, deprecated in 2.1.', DeprecationWarning)
-        datafile = datafile[self.export_cols]
-
-        # Required to prevent the SettingWithCopyWarning, even when using:
-        # dataframe.loc[row_indexer,col_indexer] = value
-        pd.options.mode.chained_assignment = None
-
-        datafile.loc[:, Column.prediction.value] = 'empty'
-        datafile.loc[:, Column.combined_prediction.value] = 'empty'
-        datafile.loc[:, Column.PHRED.value] = 0.0
-        datafile.drop(columns=Column.FeatureID.value, inplace=True)
-        datafile = datafile[
-            [Column.chr_pos_ref_alt.value,
-             Column.GeneName.value,
-             Column.Consequence.value,
-             Column.PHRED.value,
-             Column.probabilities.value,
-             Column.prediction.value,
-             Column.combined_prediction.value]
-        ]
-        return datafile
-
     def export_capice_training_dataset(self, datafile: pd.DataFrame, name: str, feature: str):
         """
         Function specific to export a (splitted) dataset comming from the training pathway.

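The exporter now writes the selected columns directly as a gzip-compressed TSV instead of routing through the legacy conversion. A standalone sketch of that pandas pattern, using an illustrative DataFrame and a hypothetical output path (the column list mirrors the Column enum further down, the data is made up):

```python
import pandas as pd

# Hypothetical column selection and output path, for illustration only.
export_cols = ['chr_pos_ref_alt', 'GeneName', 'FeatureID', 'Consequence', 'probabilities']
filename = 'example_capice.txt.gz'

datafile = pd.DataFrame({
    'chr_pos_ref_alt': ['1_100_A_C'],
    'GeneName': ['foo'],
    'FeatureID': ['ENST00000000001'],
    'Consequence': ['Synonymous'],
    'probabilities': [0.01],
    'extra_column': ['dropped on export'],
})

# Selecting export_cols first keeps only the documented output columns;
# compression='gzip' makes pandas write a gzipped file regardless of the name.
datafile[export_cols].to_csv(filename, sep='\t', compression='gzip', index=False)
```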
src/main/python/core/input_checker.py

Lines changed: 1 addition & 1 deletion
@@ -55,7 +55,7 @@ def check_input_output_directories(self, input_path, output_path):
             # Then I know it's an output filename
             self.output_directory = os.path.dirname(input_path)
             self.output_filename = output_path
-            # self._check_gzip_extension()
+            self._check_gzip_extension()
 
     def _create_capice_output_filename(self, input_path, output_path=None, append_capice=True, ispath=False):
         if output_path is None:

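The re-enabled `_check_gzip_extension()` call is what turns `test.txt` into `test.txt.gz` in the input checker tests further down. Its implementation is not part of this diff; the following is only a rough sketch, under the assumption that it simply appends the missing extension:

```python
# Assumption: a gzip-extension check of this shape; the real
# _check_gzip_extension implementation is not shown in this commit.
def check_gzip_extension(output_filename: str) -> str:
    """Append '.gz' when the user-supplied output filename lacks it."""
    if not output_filename.endswith('.gz'):
        output_filename += '.gz'
    return output_filename


print(check_gzip_extension('test.txt'))  # test.txt.gz, as asserted in the tests below
```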
src/main/python/resources/enums/sections.py

Lines changed: 0 additions & 3 deletions
@@ -14,6 +14,3 @@ class Column(Enum):
     FeatureID = 'FeatureID'
     Consequence = 'Consequence'
     probabilities = 'probabilities'
-    prediction = 'prediction'
-    combined_prediction = 'combined_prediction'
-    PHRED = 'PHRED'

src/main/python/resources/preprocessors/preprocessor.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def _raise_no_module_found_error(self):
         Specialized function to be used into _load_preprocessors() and _load_correct_preprocessor() to be raised when
         no preprocessing files can be found.
         """
-        error_message = 'No usable python files are found within the imputing directory!'
+        error_message = 'No usable python files are found within the model directory!'
         self.log.critical(error_message)
         raise FileNotFoundError(error_message)

src/test/python/core/test_exporter.py

Lines changed: 6 additions & 20 deletions
@@ -19,17 +19,6 @@ def setUpClass(cls):
             Column.Consequence.value: ['Synonymous', 'Frame-shift'],
             Column.probabilities.value: [0.01, 0.998]
         })
-        cls.legacy_export_prediction = pd.DataFrame(
-            {
-                Column.chr_pos_ref_alt.value: ['1_100_A_C', '2_200_T_G'],
-                Column.GeneName.value: ['foo', 'bar'],
-                Column.Consequence.value: ['Synonymous', 'Frame-shift'],
-                Column.PHRED.value: [0.0, 0.0],
-                Column.probabilities.value: [0.01, 0.998],
-                Column.prediction.value: ['empty', 'empty'],
-                Column.combined_prediction.value: ['empty', 'empty']
-            }
-        )
         cls.export_dataset = pd.DataFrame(
             {
                 'chr': [1, 2],
@@ -53,13 +42,8 @@ def test_prediction_output(self):
         print('Prediction output')
         self.exporter.capice_filename = 'test_output.tsv'
         self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
-        exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), sep='\t')
-        pd.testing.assert_frame_equal(exported_data, self.legacy_export_prediction)
-
-    def test_legacy_conversion(self):
-        print('Legacy output conversion')
-        converted_legacy = self.exporter._export_legacy_prediction(datafile=self.prediction_output_dataframe)
-        pd.testing.assert_frame_equal(converted_legacy, self.legacy_export_prediction)
+        exported_data = pd.read_csv(os.path.join(self.output_loc, 'test_output.tsv'), compression='gzip', sep='\t')
+        pd.testing.assert_frame_equal(exported_data, self.prediction_output_dataframe)
 
     def test_dataset_export(self):
         print('Dataset export')
@@ -85,8 +69,10 @@ def test_exporter_force(self):
         self.exporter.force = True
         self.exporter.capice_filename = 'already_present_file.tsv'
         self.exporter.export_capice_prediction(datafile=self.prediction_output_dataframe)
-        forced_file = pd.read_csv(os.path.join(self.output_loc, 'already_present_file.tsv'), sep='\t')
-        pd.testing.assert_frame_equal(forced_file, self.legacy_export_prediction)
+        forced_file = pd.read_csv(
+            os.path.join(self.output_loc, 'already_present_file.tsv'), compression='gzip', sep='\t'
+        )
+        pd.testing.assert_frame_equal(forced_file, self.prediction_output_dataframe)
 
 
 if __name__ == '__main__':

src/test/python/core/test_input_checker.py

Lines changed: 3 additions & 3 deletions
@@ -86,8 +86,7 @@ def test_input_output_conversion_case3(self):
         print('Input output conversion (input + output directory + filename)')
         test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
         test_output = os.path.join('.', 'test_output', 'test.txt')
-        # expected_output_filename = 'test.txt.gz'
-        expected_output_filename = 'test.txt'  # Legacy support, if legacy is disabled can be removed.
+        expected_output_filename = 'test.txt.gz'
         expected_output_directory = os.path.join('.', 'test_output')
         self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
         self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
@@ -97,9 +96,10 @@ def test_input_output_conversion_case4(self):
         print('Input output conversion (input + filename)')
         test_input = os.path.join('.', 'CAPICE_example', 'test_cadd14_grch37_annotated.tsv.gz')
         test_output = 'test.txt'
+        expected_output_filename = 'test.txt.gz'
         expected_output_directory = os.path.join('.', 'CAPICE_example')
         self.input_checker.check_input_output_directories(input_path=test_input, output_path=test_output)
-        self.assertEqual(self.input_checker.get_output_filename(), test_output)
+        self.assertEqual(self.input_checker.get_output_filename(), expected_output_filename)
         self.assertEqual(self.input_checker.get_output_directory(), expected_output_directory)
 
     def test_log_checker_both(self):

src/test/python/test_main_nontrain.py

Lines changed: 2 additions & 2 deletions
@@ -36,8 +36,8 @@ def test_integration_main_nontrain(self):
                         input_loc=infile,
                         output_loc=self.output_dir)
         main.run()
-        prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), sep='\t')
-        self.assertEqual(prediction_output.shape, (20, 7))
+        prediction_output = pd.read_csv(os.path.join(self.output_dir, 'test_output.txt'), compression='gzip', sep='\t')
+        self.assertEqual(prediction_output.shape, (20, 5))
 
 
 if __name__ == '__main__':
