From 5dfe298c52298409136972dc0092fd37bdcdcd41 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 21 Mar 2024 16:03:32 +0100 Subject: [PATCH 001/130] Add clinical_attribute_meta records to the seed mini To make the dataset look like real data in the database --- src/test/resources/seed_mini.sql | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index e17819cf..3dfd5ff9 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -90,6 +90,15 @@ INSERT INTO `reference_genome` VALUES (2,'human','hg38','GRCh38',3049315783,'htt INSERT INTO "cancer_study" ("CANCER_STUDY_ID","CANCER_STUDY_IDENTIFIER","TYPE_OF_CANCER_ID","NAME","DESCRIPTION","PUBLIC","PMID","CITATION","GROUPS","REFERENCE_GENOME_ID") VALUES (1,'study_tcga_pub','brca','Breast Invasive Carcinoma (TCGA,Nature 2012)','The Cancer Genome Atlas (TCGA) Breast Invasive Carcinoma project. 825 cases.
Nature 2012. Raw data via the TCGA Data Portal.',1,'23000897,26451490','TCGA,Nature 2012,...','SU2C-PI3K;PUBLIC;GDAC',1); +-- clinical_attribute_meta +INSERT INTO "clinical_attribute_meta" +VALUES ('DFS_MONTHS','Disease Free (Months)','Disease free in months since treatment','NUMBER',0,'1',1), +('DFS_STATUS','Disease Free Status','Disease free status','STRING',0,'1',1), +('OS_MONTHS','Overall Survival (Months)','Overall survival in months since diagnosis','NUMBER',0,'1',1), +('OS_STATUS','Overall Survival Status','Overall survival status','STRING',0,'1',1), +('SAMPLE_COUNT','Number of Samples Per Patient','Number of Samples Per Patient','STRING',1,'1',1), +('SUBTYPE','Subtype','Subtype description','STRING',0,'1',1); + -- gene as genetic_entity INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); From 531b10ae748c10422e3eeea0cf7328dd1c346d57 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 21 Mar 2024 16:05:37 +0100 Subject: [PATCH 002/130] Implement sample attribute rewriting flag --- .../cbio/portal/dao/DaoClinicalData.java | 23 ++- .../portal/scripts/ImportClinicalData.java | 58 +++--- .../ITIncrementalSamplesImport.java | 183 ++++++++++++++++++ .../clinical_data_single_SAMPLE.txt | 6 + .../meta_clinical_sample.txt | 4 + .../clinical_data_single_SAMPLE.txt | 6 + .../meta_clinical_sample.txt | 4 + .../clinical_data_single_SAMPLE.txt | 6 + .../meta_clinical_sample.txt | 4 + 9 files changed, 269 insertions(+), 25 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java create mode 100644 src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt create mode 100644 src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt create mode 100644 src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt create mode 100644 
src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt create mode 100644 src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt create mode 100644 src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index f626d9f4..8e8acaf1 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -50,9 +50,10 @@ public final class DaoClinicalData { public static final String SAMPLE_TABLE = "clinical_sample"; public static final String PATIENT_TABLE = "clinical_patient"; - private static final String SAMPLE_INSERT = "INSERT INTO " + SAMPLE_TABLE + "(`INTERAL_ID`,`ATTR_ID`,`ATTR_VALUE` VALUES(?,?,?)"; - private static final String PATIENT_INSERT = "INSERT INTO " + PATIENT_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE` VALUES(?,?,?)"; + private static final String SAMPLE_INSERT = "INSERT INTO " + SAMPLE_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static final String PATIENT_INSERT = "INSERT INTO " + PATIENT_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static final String SAMPLE_DELETE = "DELETE FROM " + SAMPLE_TABLE + " WHERE `INTERNAL_ID` = ?"; private static final Map sampleAttributes = new HashMap(); private static final Map patientAttributes = new HashMap(); @@ -364,6 +365,24 @@ public static List getSampleData(int cancerStudyId, Collection getSampleData(int cancerStudyId, Collection sampleIds) throws DaoException { List sampleIdsInt = new ArrayList(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 11eeedbc..0978354f 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -61,6 +61,7 @@ public class ImportClinicalData extends ConsoleRunnable { private CancerStudy cancerStudy; private AttributeTypes attributesType; private boolean relaxed; + private boolean overwriteExisting; private Set patientIds = new HashSet(); public static enum MissingAttributeValues @@ -332,19 +333,23 @@ private boolean addDatum(String[] fields, List columnAttrs, M //check if sample is not already added: Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), stableSampleId, false); if (sample != null) { - //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) - //and an ERROR in other studies. I.e. a sample should occur only once in clinical file! - if (stableSampleId.startsWith("TCGA-")) { - ProgressMonitor.logWarning("Sample " + stableSampleId + " found to be duplicated in your file. Only data of the first sample will be processed."); - return false; - } - //give error or warning if sample is already in DB and this is NOT expected (i.e. not supplemental data): - if (!this.isSupplementalData()) { - throw new RuntimeException("Error: Sample " + stableSampleId + " found to be duplicated in your file."); - } - else { - internalSampleId = sample.getInternalId(); - } + if (overwriteExisting) { + internalSampleId = sample.getInternalId(); + DaoClinicalData.removeSampleData(internalSampleId); + } else { + //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) + //and an ERROR in other studies. I.e. a sample should occur only once in clinical file! + if (stableSampleId.startsWith("TCGA-")) { + ProgressMonitor.logWarning("Sample " + stableSampleId + " found to be duplicated in your file. 
Only data of the first sample will be processed."); + return false; + } + if (this.isSupplementalData()) { + internalSampleId = sample.getInternalId(); + } else { + //give error or warning if sample is already in DB and this is NOT expected (i.e. not supplemental data): + throw new RuntimeException("Error: Sample " + stableSampleId + " found to be duplicated in your file."); + } + } } else { Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), stablePatientId); @@ -616,12 +621,14 @@ public void run() { "cancer study id").withOptionalArg().describedAs("study").ofType(String.class); OptionSpec attributeFlag = parser.accepts("a", "(deprecated) Flag for using MIXED_ATTRIBUTES").withOptionalArg().describedAs("a").ofType(String.class); - OptionSpec relaxedFlag = parser.accepts("r", + OptionSpec relaxedFlag = parser.accepts("r", "(not recommended) Flag for relaxed mode, determining how to handle detected data harmonization problems in the same study").withOptionalArg().describedAs("r").ofType(String.class); parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withOptionalArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete"); - + OptionSpec overWriteExistingFlag = parser.accepts("overwrite-existing", + "Flag that enables re-uploading data for the patient/sample entries that already exist in the database").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + OptionSet options = null; try { options = parser.parse( args ); @@ -652,15 +659,20 @@ public void run() { attributesDatatype = properties.getProperty("datatype"); cancerStudyStableId = properties.getProperty("cancer_study_identifier"); } - if( options.has ( attributeFlag ) ) - { - attributesDatatype = "MIXED_ATTRIBUTES"; - } - if( options.has ( relaxedFlag ) ) - { - relaxed = true; + if( 
options.has ( attributeFlag ) ) + { + attributesDatatype = "MIXED_ATTRIBUTES"; + } + if( options.has ( relaxedFlag ) ) + { + relaxed = true; - } + } + if( options.has ( overWriteExistingFlag ) ) + { + overwriteExisting = true; + + } SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java new file mode 100644 index 00000000..3d748686 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.scripts.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalData; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportClinicalData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.Assert.*; + +/** + * Tests Incremental Import of Sample Clinical Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class ITIncrementalSamplesImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + private final String UPDATE_TCGA_SAMPLE_ID = "TCGA-A1-A0SD-01"; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test inserting new sample for existing patient + */ + @Test + public void testInsertNewSampleForExistingPatient() throws DaoException { + /** + * prepare a new patient without samples + */ + Patient patient = new Patient(cancerStudy, "TEST-INC-TCGA-P1"); + int internalPatientId = DaoPatient.addPatient(patient); + + String newSampleId = "TEST-INC-TCGA-P1-S1"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + }); + importClinicalData.run(); + + List samples = DaoSample.getSamplesByPatientId(internalPatientId); + assertEquals("A new sample has to be attached to the patient", 1, samples.size()); + Sample sample = samples.get(0); + assertEquals(newSampleId, sample.getStableId()); + + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_STATUS", "1:DECEASED", + "OS_MONTHS", "12.34", + "DFS_STATUS", "1:Recurred/Progressed"), 
sampleAttrs); + } + + /** + * Test inserting new sample for nonexistent patient. + * EXPECTED RESULTS: + * 1. The new patient entry has to be inserted + * 2. Sample and all its clinical attributes have to be inserted + */ + @Test + public void testInsertNewSampleForNonexistentPatient() throws DaoException { + String newPatientId = "TEST-INC-TCGA-P2"; + String newSampleId = "TEST-INC-TCGA-P2-S1"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + }); + importClinicalData.run(); + + Patient newPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), newPatientId); + assertNotNull("The new patient has to be created.", newPatient); + + List samples = DaoSample.getSamplesByPatientId(newPatient.getInternalId()); + assertEquals("A new sample has to be attached to the patient", 1, samples.size()); + Sample sample = samples.get(0); + assertEquals(newSampleId, sample.getStableId()); + + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "Luminal A", + "OS_STATUS", "0:LIVING", + "OS_MONTHS", "23.45", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "100"), sampleAttrs); + } + + /** + * Test reloading sample clinical attributes + */ + @Test + public void testReloadSampleClinicalAttributes() throws DaoException { + /** + * Add to a tcga sample some clinical attributes (test data sets doesn't have any) + */ + Sample tcgaSample = 
DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), + UPDATE_TCGA_SAMPLE_ID); + DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "SUBTYPE", "Luminal A"); + DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "OS_STATUS", "0:LIVING"); + DaoClinicalData.addSampleDatum(tcgaSample.getInternalId(), "OS_MONTHS", "34.56"); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_SAMPLE.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(UPDATE_TCGA_SAMPLE_ID)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "OS_STATUS", "1:DECEASED", + "OS_MONTHS", "45.67", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "123"), sampleAttrs); + } +} diff --git a/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..e8b77aec --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Patient Identifier Sample Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 
1 1 +SAMPLE_ID PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P1-S1 TEST-INC-TCGA-P1 basal-like 1:DECEASED 12.34 1:Recurred/Progressed NA diff --git a/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt b/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..1252404b --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Patient Identifier Sample Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P2-S1 TEST-INC-TCGA-P2 Luminal A 0:LIVING 23.45 1:Recurred/Progressed 100 diff --git a/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ 
b/src/test/resources/incremental/insert_single_tcga_sample_for_nonexistent_patient/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt diff --git a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..969f504f --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Patient Identifier Sample Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SD-01 TCGA-A1-A0SD 1:DECEASED 45.67 1:Recurred/Progressed 123 diff --git a/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt b/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_sample/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt From 248a08c06f75cd638e52ade62829b5f9d41f22f1 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 21 Mar 2024 16:07:24 +0100 Subject: [PATCH 003/130] Add --overwrite-existing for the rest of test cases Apparently, the flag does not change anything. But we add it anyway as the tests for "incremental" data upload.
--- .../portal/scripts/incremental/ITIncrementalSamplesImport.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java index 3d748686..77294dc6 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java @@ -92,6 +92,7 @@ public void testInsertNewSampleForExistingPatient() throws DaoException { ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { "--meta", metaFile.getAbsolutePath(), "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", }); importClinicalData.run(); @@ -126,6 +127,7 @@ public void testInsertNewSampleForNonexistentPatient() throws DaoException { ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { "--meta", metaFile.getAbsolutePath(), "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", }); importClinicalData.run(); From 2bc7271014859d50c42336d18e4fd80cf5a16754 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 21 Mar 2024 21:45:12 +0100 Subject: [PATCH 004/130] Test that mutations stay after updating the sample attributes --- .../incremental/ITIncrementalSamplesImport.java | 16 +++++++++++----- .../clinical_data_single_SAMPLE.txt | 2 +- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java index 77294dc6..45ba8e1b 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java @@ -36,10 +36,7 @@ import org.junit.Test; import org.junit.runner.RunWith; 
import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.CancerStudy; -import org.mskcc.cbio.portal.model.ClinicalData; -import org.mskcc.cbio.portal.model.Patient; -import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.model.*; import org.mskcc.cbio.portal.scripts.ImportClinicalData; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -47,6 +44,7 @@ import org.springframework.transaction.annotation.Transactional; import java.io.File; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -67,7 +65,7 @@ public class ITIncrementalSamplesImport { public static final String STUDY_ID = "study_tcga_pub"; private CancerStudy cancerStudy; - private final String UPDATE_TCGA_SAMPLE_ID = "TCGA-A1-A0SD-01"; + private final String UPDATE_TCGA_SAMPLE_ID = "TCGA-A1-A0SH-01"; @Before public void setUp() throws DaoException { @@ -181,5 +179,13 @@ public void testReloadSampleClinicalAttributes() throws DaoException { "OS_MONTHS", "45.67", "DFS_STATUS", "1:Recurred/Progressed", "DFS_MONTHS", "123"), sampleAttrs); + + /** + * Sub-entries stayed as they were, not removed. 
+ */ + GeneticProfile mutationsProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationsProfile); + ArrayList mutations = DaoMutation.getMutations(mutationsProfile.getGeneticProfileId(), tcgaSample.getInternalId()); + assertEquals(2, mutations.size()); } } diff --git a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt index 969f504f..5088066f 100644 --- a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt +++ b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -3,4 +3,4 @@ #STRING STRING STRING NUMBER STRING NUMBER #1 1 1 1 1 1 SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS -TCGA-A1-A0SD-01 TCGA-A1-A0SD 1:DECEASED 45.67 1:Recurred/Progressed 123 +TCGA-A1-A0SH-01 TCGA-A1-A0SH 1:DECEASED 45.67 1:Recurred/Progressed 123 From 31e31945079164369a16725aac98ad96fb5023a2 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 21 Mar 2024 21:46:38 +0100 Subject: [PATCH 005/130] Add overwrite-existing support for mutations data --- .../mskcc/cbio/portal/dao/DaoMutation.java | 19 +-- .../scripts/ImportExtendedMutationData.java | 12 +- .../portal/scripts/ImportProfileData.java | 3 +- .../mskcc/cbio/portal/util/ConsoleUtil.java | 3 + .../ITIncrementalMutationsImport.java | 144 ++++++++++++++++++ .../data_mutations_extended.txt | 4 + .../insert_mutation_data/meta_mutations.txt | 8 + .../data_mutations_extended.txt | 4 + .../update_mutation_data/meta_mutations.txt | 8 + 9 files changed, 189 insertions(+), 16 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java create mode 100644 src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt create mode 100644 
src/test/resources/incremental/insert_mutation_data/meta_mutations.txt create mode 100644 src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt create mode 100644 src/test/resources/incremental/update_mutation_data/meta_mutations.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index 8adbdadd..d2f80527 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -475,18 +475,10 @@ public static ArrayList getMutations (long entrezGeneId, Strin return mutationList; } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. - */ - @Deprecated public static ArrayList getMutations (int geneticProfileId, int sampleId) throws DaoException { return getMutations(geneticProfileId, Arrays.asList(Integer.valueOf(sampleId))); } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. - */ - @Deprecated public static ArrayList getMutations (int geneticProfileId, List sampleIds) throws DaoException { Connection con = null; PreparedStatement pstmt = null; @@ -1501,19 +1493,18 @@ protected static String boolToStr(boolean value) return value ? "1" : "0"; } - /** - * @deprecated We believe that this method is no longer called by any part of the codebase, and it will soon be deleted. 
- */ - @Deprecated - public static void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException { + public static void deleteAllRecordsInGeneticProfileForSample(long geneticProfileId, long internalSampleId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoMutation.class); - pstmt = con.prepareStatement("DELETE from mutation WHERE GENETIC_PROFILE_ID=?"); + pstmt = con.prepareStatement("DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"); pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); + // TODO Remove row in mutation_event if it does not have mutations left + // TODO Remove profile if no mutations nor mutation_event(s) left } catch (SQLException e) { throw new DaoException(e); } finally { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 39dd97c3..3da90645 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -75,11 +75,17 @@ public class ImportExtendedMutationData{ private Pattern SEQUENCE_SAMPLES_REGEX = Pattern.compile("^.*sequenced_samples:(.*)$"); private final String ASCN_NAMESPACE = "ASCN"; + private final boolean overwriteExisting; + /** * construct an ImportExtendedMutationData. * Filter mutations according to the no argument MutationFilter(). 
*/ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel, Set filteredMutations, Set namespaces) { + this(mutationFile, geneticProfileId, genePanel, filteredMutations, namespaces, false); + } + + public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel, Set filteredMutations, Set namespaces, boolean overwriteExisting) { this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; this.swissprotIsAccession = false; @@ -89,6 +95,7 @@ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, Strin // create default MutationFilter myMutationFilter = new MutationFilter( ); this.namespaces = namespaces; + this.overwriteExisting = overwriteExisting; } public ImportExtendedMutationData(File mutationFile, int geneticProfileId, String genePanel) { @@ -150,7 +157,7 @@ public void importData() throws IOException, DaoException { referenceGenome = GlobalProperties.getReferenceGenomeName(); } String genomeBuildName = DaoReferenceGenome.getReferenceGenomeByGenomeName(referenceGenome).getBuildName(); - + Set processedSamples = new HashSet<>(); while((line=buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); @@ -180,6 +187,9 @@ public void importData() throws IOException, DaoException { else { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(barCode) + "' found in MAF file: " + this.mutationFile.getCanonicalPath()); } + } else if (overwriteExisting && !processedSamples.contains(sample.getInternalId())) { + DaoMutation.deleteAllRecordsInGeneticProfileForSample(geneticProfileId, sample.getInternalId()); + processedSamples.add(sample.getInternalId()); } String validationStatus = record.getValidationStatus(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 54ce204e..a0ffe297 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -73,6 +73,7 @@ public void run() { if (options.has("update-info") && (((String) options.valueOf("update-info")).equalsIgnoreCase("true") || options.valueOf("update-info").equals("1"))) { updateInfo = true; } + boolean overwriteExisting = options.has("overwrite-existing"); SpringUtil.initDataSource(); ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getAbsolutePath()); // Load genetic profile and gene panel @@ -98,7 +99,7 @@ public void run() { geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_UNCALLED) { Set filteredMutations = GeneticProfileReader.getVariantClassificationFilter( descriptorFile ); Set namespaces = GeneticProfileReader.getNamespaces( descriptorFile ); - ImportExtendedMutationData importer = new ImportExtendedMutationData(dataFile, geneticProfile.getGeneticProfileId(), genePanel, filteredMutations, namespaces); + ImportExtendedMutationData importer = new ImportExtendedMutationData(dataFile, geneticProfile.getGeneticProfileId(), genePanel, filteredMutations, namespaces, overwriteExisting); String swissprotIdType = geneticProfile.getOtherMetaDataField("swissprot_identifier"); if (swissprotIdType != null && swissprotIdType.equals("accession")) { importer.setSwissprotIsAccession(true); diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index b5b36227..38ea5ad0 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -251,6 +251,9 @@ public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, Str parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } + 
parser.accepts("overwrite-existing", + "Flag that enables re-uploading data for the patient/sample entries that already exist in the database").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + String progName = "importScript"; OptionSet options = null; diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java new file mode 100644 index 00000000..bf1f8be0 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.scripts.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Mutation Molecular Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class ITIncrementalMutationsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test inserting new mutation profile data for existing sample and genetic profile + */ + @Test + public void testInsertNewMutationProfileDataForExistingSampleAndProfile() throws DaoException { + GeneticProfile mutationGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationGeneticProfile); + String mutationDataSampleId = "TCGA-A1-A0SE-01"; + /** + * this sample does not have mutation data attached + */ + Sample mutationDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), mutationDataSampleId); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_mutation_data/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_mutations.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_mutations_extended.txt"); + + ImportProfileData importProfileData = new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + ArrayList insertedMutations = getMutations( + mutationGeneticProfile.getGeneticProfileId(), + mutationDataSample.getInternalId()); + assertEquals(3, insertedMutations.size()); + assertNotNull(insertedMutations.get(0).getEvent()); + assertNotNull(insertedMutations.get(1).getEvent()); + assertNotNull(insertedMutations.get(2).getEvent()); + } + /** + * Test updating mutation profile data for existing sample. 
The mutation genetic profile exists. + */ + @Test + public void testUpdateMutationProfileDataForExistingSampleAndProfile() throws DaoException { + GeneticProfile mutationGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + assertNotNull(mutationGeneticProfile); + String mutationDataSampleId = "TCGA-A1-A0SH-01"; + /** + * this sample does have 2 mutation data rows attached. See seed_mini.sql + */ + Sample mutationDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), mutationDataSampleId); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_mutation_data/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_mutations.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_mutations_extended.txt"); + + ImportProfileData importProfileData = new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + ArrayList insertedMutations = getMutations( + mutationGeneticProfile.getGeneticProfileId(), + mutationDataSample.getInternalId()); + assertEquals(3, insertedMutations.size()); + assertNotNull(insertedMutations.get(0).getEvent()); + assertNotNull(insertedMutations.get(1).getEvent()); + assertNotNull(insertedMutations.get(2).getEvent()); + Set entrezIds = insertedMutations.stream().map(m -> m.getEntrezGeneId()).collect(Collectors.toSet()); + Set expected = Set.of(207L, 208L, 672L); + assertEquals(expected, entrezIds); + } + +} diff --git a/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt b/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt new file mode 100644 index 00000000..1eec7202 --- /dev/null +++ b/src/test/resources/incremental/insert_mutation_data/data_mutations_extended.txt @@ -0,0 +1,4 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build 
Tumor_Sample_Barcode Verification_Status Validation_Status Mutation_Status Sequencer Chromosome Start_position End_position Variant_Classification HGVSp_Short MA:FImpact MA:link.MSA MA:link.PDB +AKT1 207 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 22078087 22078087 Missense_Mutation D820N neutral mutationassessor.org/?cm=msa&ty=f&p=PGBM_HUMAN&rb=814&re=869&var=D820N +AKT2 208 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 34085156 34085156 Missense_Mutation V277I low mutationassessor.org/?cm=msa&ty=f&p=CSMD2_HUMAN&rb=202&re=303&var=V277I mutationassessor.org/pdb.php?prot=CSMD2_HUMAN&from=202&to=303&var=V277I +AKT3 10000 broad.mit.edu GRCh37 TCGA-A1-A0SE-01 Unknown valid Unknown Illumina GAIIx chr1 35989584 35989584 Missense_Mutation F628L mutationassessor.org/?cm=msa&ty=f&p=CLSPN_HUMAN&rb=601&re=800&var=F628L diff --git a/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt new file mode 100644 index 00000000..37915344 --- /dev/null +++ b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MUTATION_EXTENDED +stable_id: mutations +datatype: MAF +show_profile_in_analysis_tab: true +profile_name: Test Mutations +profile_description: Mutation data for testing. 
+data_filename: data_mutations_extended.txt \ No newline at end of file diff --git a/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt b/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt new file mode 100644 index 00000000..e9703211 --- /dev/null +++ b/src/test/resources/incremental/update_mutation_data/data_mutations_extended.txt @@ -0,0 +1,4 @@ +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Tumor_Sample_Barcode Verification_Status Validation_Status Mutation_Status Sequencer Chromosome Start_position End_position Variant_Classification HGVSp_Short MA:FImpact MA:link.MSA MA:link.PDB +AKT1 207 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr1 22078087 22078087 Missense_Mutation D820N neutral mutationassessor.org/?cm=msa&ty=f&p=PGBM_HUMAN&rb=814&re=869&var=D820N +AKT2 208 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr1 34085156 34085156 Missense_Mutation V277I low mutationassessor.org/?cm=msa&ty=f&p=CSMD2_HUMAN&rb=202&re=303&var=V277I mutationassessor.org/pdb.php?prot=CSMD2_HUMAN&from=202&to=303&var=V277I +BRCA1 672 broad.mit.edu GRCh37 TCGA-A1-A0SH-01 Unknown valid Unknown Illumina GAIIx chr17 35989584 35989584 Missense_Mutation F628L mutationassessor.org/?cm=msa&ty=f&p=CLSPN_HUMAN&rb=601&re=800&var=F628L diff --git a/src/test/resources/incremental/update_mutation_data/meta_mutations.txt b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt new file mode 100644 index 00000000..37915344 --- /dev/null +++ b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MUTATION_EXTENDED +stable_id: mutations +datatype: MAF +show_profile_in_analysis_tab: true +profile_name: Test Mutations +profile_description: Mutation data for testing. 
+data_filename: data_mutations_extended.txt \ No newline at end of file From bd023a9dda9d340fccb28386234416199342f9ef Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 22 Mar 2024 14:25:48 +0100 Subject: [PATCH 006/130] Fix --overwrite-existing flag description for importer of profile data --- src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index 38ea5ad0..c7fafb92 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -252,7 +252,7 @@ public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, Str .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } parser.accepts("overwrite-existing", - "Flag that enables re-uploading data for the patient/sample entries that already exist in the database").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + "Flag that enables re-uploading molecular data that already exist (the same profile and sample id) in the database.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String progName = "importScript"; From c49bbf3cb40427be27e01001906301c6e0da31b2 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 28 Mar 2024 15:16:46 +0100 Subject: [PATCH 007/130] Add loader command to update case list with sample ids adding to the all case list and case list specified with command arguments is supported --- .../mskcc/cbio/portal/dao/DaoSampleList.java | 77 ++++---- .../scripts/UpdateCaseListsSampleIds.java | 183 ++++++++++++++++++ .../ITUpdateCaseListsSampleIds.java | 161 +++++++++++++++ .../clinical_data_single_SAMPLE.txt | 6 + .../meta_clinical_sample.txt | 4 + 5 files changed, 390 insertions(+), 41 deletions(-) create mode 100644 
src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java create mode 100644 src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt create mode 100644 src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index c3200389..e62540f7 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -61,8 +61,15 @@ public int addSampleList(SampleList sampleList) throws DaoException { pstmt.setString(4, sampleList.getSampleListCategory().getCategory()); pstmt.setString(5, sampleList.getDescription()); rows = pstmt.executeUpdate(); - int listListRow = addSampleListList(sampleList, con); - rows = (listListRow != -1) ? (rows + listListRow) : rows; + try (ResultSet generatedKey = pstmt.getGeneratedKeys()) { + if (generatedKey.next()) { + int listId = generatedKey.getInt(1); + int listListRow = addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con, false); + rows = (listListRow != -1) ? 
(rows + listListRow) : rows; + } else { + throw new SQLException("Creating sample list failed, no ID obtained."); + } + } } catch (SQLException e) { throw new DaoException(e); } finally { @@ -204,41 +211,12 @@ public void deleteAllRecords() throws DaoException { } } - /** - * Given a patient list, gets list id from sample_list table - */ - private int getSampleListId(SampleList sampleList) throws DaoException { - Connection con = null; - PreparedStatement pstmt = null; - ResultSet rs = null; - try { - con = JdbcUtil.getDbConnection(DaoSampleList.class); - pstmt = con.prepareStatement("SELECT LIST_ID FROM sample_list WHERE STABLE_ID=?"); - pstmt.setString(1, sampleList.getStableId()); - rs = pstmt.executeQuery(); - if (rs.next()) { - return rs.getInt("LIST_ID"); - } - return -1; - } catch (SQLException e) { - throw new DaoException(e); - } finally { - JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, rs); - } - } - /** * Adds record to sample_list_list. */ - private int addSampleListList(SampleList sampleList, Connection con) throws DaoException { + private int addSampleListList(int cancerStudyId, int sampleListId, List sampleList, Connection con, boolean ignoreDuplicates) throws DaoException { - // get patient list id - int sampleListId = getSampleListId(sampleList); - if (sampleListId == -1) { - return -1; - } - - if (sampleList.getSampleList().isEmpty()) { + if (sampleList.isEmpty()) { return 0; } @@ -246,25 +224,27 @@ private int addSampleListList(SampleList sampleList, Connection con) throws DaoE ResultSet rs = null; int skippedPatients = 0; try { - StringBuilder sql = new StringBuilder("INSERT INTO sample_list_list (`LIST_ID`, `SAMPLE_ID`) VALUES "); + StringBuilder sql = new StringBuilder("INSERT "); + if (ignoreDuplicates) { + sql.append("IGNORE "); + } + sql.append("INTO sample_list_list (`LIST_ID`, `SAMPLE_ID`) VALUES "); // NOTE - as of 12/12/14, patient lists contain sample ids - for (String sampleId : sampleList.getSampleList()) { - Sample sample = 
DaoSample.getSampleByCancerStudyAndSampleId(sampleList.getCancerStudyId(), sampleId); + for (String sampleId : sampleList) { + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, sampleId); if (sample == null) { - System.out.println("null sample: " + sampleId + ":" + sampleList.getStableId()); + System.out.println("null sample: " + sampleId); ++skippedPatients; continue; } sql.append("('").append(sampleListId).append("','").append(sample.getInternalId()).append("'),"); } - if (skippedPatients == sampleList.getSampleList().size()) { + if (skippedPatients == sampleList.size()) { return 0; } sql.deleteCharAt(sql.length()-1); pstmt = con.prepareStatement(sql.toString()); return pstmt.executeUpdate(); - } catch (NullPointerException e) { - throw new DaoException(e); } catch (SQLException e) { throw new DaoException(e); } finally { @@ -272,6 +252,20 @@ private int addSampleListList(SampleList sampleList, Connection con) throws DaoE } } + public int updateSampleListList(SampleList sampleList) throws DaoException { + Connection con = null; + try { + con = JdbcUtil.getDbConnection(DaoSampleList.class); + + return addSampleListList(sampleList.getCancerStudyId(), sampleList.getSampleListId(), sampleList.getSampleList(), con, true); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoSampleList.class, con, null); + } + } + + /** * Given a patient list object (thus patient list id) gets patient list list. 
*/ @@ -287,7 +281,8 @@ private ArrayList getSampleListList(SampleList sampleList, Connection co ArrayList patientIds = new ArrayList(); while (rs.next()) { // NOTE - as of 12/12/14, patient lists contain sample ids - Sample sample = DaoSample.getSampleById(rs.getInt("SAMPLE_ID")); + int sample_id = rs.getInt("SAMPLE_ID"); + Sample sample = DaoSample.getSampleById(sample_id); patientIds.add(sample.getStableId()); } return patientIds; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java new file mode 100644 index 00000000..0616ef74 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.scripts; + +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.model.SampleList; + +import java.io.*; +import java.util.ArrayList; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Set; + +public class UpdateCaseListsSampleIds extends ConsoleRunnable { + + private File metaFile; + private File dataFile; + private Set addToCaseListsStableIds = Set.of(); + private String cancerStudyStableId; + private LinkedHashSet sampleIds; + private DaoSampleList daoSampleList = new DaoSampleList(); + + public UpdateCaseListsSampleIds(String[] args) { + super(args); + } + + /** + * Adds the sample ids found in the data file (located via the meta file) to the all case list and to the case lists given on the command line + */ + public void run() { + parseArguments(); + readStudyIdAndDataFileFromMetaFile(); + readSampleIdsFromDataFile(); + updateCaseLists(); + } + + private void updateCaseLists() { + // TODO Do we really have to do this? Is there a better way? + DaoCancerStudy.reCacheAll(); + try { + Set addSamplesToTheCaseListsStableIds = new LinkedHashSet<>(this.addToCaseListsStableIds); + // TODO has the all case list always to exist? 
+ String allCaseListStableId = this.cancerStudyStableId + "_all"; + // we always add sample to the all case list + addSamplesToTheCaseListsStableIds.add(allCaseListStableId); + for (String caseListStableId: addSamplesToTheCaseListsStableIds) { + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + if (sampleList == null) { + throw new RuntimeException("No case list with " + caseListStableId + " stable id is found"); + } + LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(this.sampleIds); + newCaseListSampleIds.addAll(sampleList.getSampleList()); + ArrayList newSampleArrayList = new ArrayList<>(newCaseListSampleIds); + sampleList.setSampleList(newSampleArrayList); + //TODO no need to run expensive db update if sampleList hasn't effectively changed + daoSampleList.updateSampleListList(sampleList); + } + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + + private void readSampleIdsFromDataFile() { + this.sampleIds = new LinkedHashSet<>(); + FileReader reader = null; + try { + reader = new FileReader(this.dataFile); + try (BufferedReader buff = new BufferedReader(reader)) { + String line; + int sampleIdPosition = -1; + while ((line = buff.readLine()) != null) { + String trimmedLine = line.trim(); + if (trimmedLine.isEmpty() || trimmedLine.startsWith("#")) { + continue; + } + + String[] fieldValues = line.split("\t"); + if (sampleIdPosition == -1) { + sampleIdPosition = List.of(fieldValues).indexOf("SAMPLE_ID"); + if (sampleIdPosition == -1) { + throw new RuntimeException("No SAMPLE_ID header is found"); + } + } else { + sampleIds.add(fieldValues[sampleIdPosition].trim()); + } + } + } + } catch (Exception e) { + throw new RuntimeException(e); + } finally { + if (reader != null) { + try { + reader.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + } + } + + private void readStudyIdAndDataFileFromMetaFile() { + TrimmedProperties properties = new TrimmedProperties(); + try { + 
FileInputStream inStream = new FileInputStream(this.metaFile); + properties.load(inStream); + this.cancerStudyStableId = properties.getProperty("cancer_study_identifier"); + String dataFilename = properties.getProperty("data_filename"); + this.dataFile = new File(metaFile.getParent(), dataFilename); + } catch (Exception e) { + throw new RuntimeException(e); + } + } + + private void parseArguments() { + String progName = getClass().getName(); + String description = "Updates (adds/removes) sample ids in specified case lists."; + + OptionParser parser = new OptionParser(); + OptionSpec meta = parser.accepts( "meta", + "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES) meta data file" ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); + OptionSpec addToCaseLists = parser.accepts( "add-to-case-lists", + "comma-separated list of case list stable ids to add sample ids found in the data file" ).withRequiredArg().describedAs( "study_id_mrna,study_id_sequenced" ).ofType( String.class ); + + try { + OptionSet options = parser.parse( args ); + this.metaFile = new File(options.valueOf(meta)); + if(options.has(addToCaseLists)){ + this.addToCaseListsStableIds = new LinkedHashSet<>(List.of(options.valueOf(addToCaseLists).split(","))); + } + } catch (OptionException e) { + throw new UsageException( + progName, description, parser, + e.getMessage()); + } + } + + /** + * Runs the command as a script and exits with an appropriate exit code. 
+ * + * @param args the arguments given on the command line + */ + public static void main(String[] args) { + ConsoleRunnable runner = new UpdateCaseListsSampleIds(args); + runner.runInConsole(); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java new file mode 100644 index 00000000..3a609c78 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. 
+ * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.scripts.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; + +import static org.junit.Assert.*; + +/** + * Tests Incremental Import of Case Lists. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class ITUpdateCaseListsSampleIds { + + DaoSampleList daoSampleList = new DaoSampleList(); + /** + * Test adding sample id to the all case list. It is the default behaviour of the command. + */ + @Test + public void testAddSampleIdToAllCaseList() throws DaoException { + String sampleIdToAdd = "TCGA-XX-0800-01"; + String allCaseListStableId = "study_tcga_pub_all"; + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_all"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all"); + } + + /** + * Test adding sample id to a MRNA case list. + * Sample has to be added to the all case list as well. 
+ */ + @Test + public void testAddSampleIdToMrnaCaseList() throws DaoException { + String sampleIdToAdd = "TCGA-XX-0800-01"; + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_mrna"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--add-to-case-lists", "study_tcga_pub_mrna" + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_mrna"); + } + + /** + * Re-adding a sample to a case list it already belongs to (effectively a no-op) should not fail. + */ + @Test + public void testReAddingSampleToTheSameListShouldWork() throws DaoException { + String sampleIdToAdd = "TCGA-A1-A0SH-01"; + String[] caseListsSampleIsPartOf = new String[] { + "study_tcga_pub_all", + "study_tcga_pub_acgh", + "study_tcga_pub_cnaseq", + "study_tcga_pub_complete", + "study_tcga_pub_log2CNA", + "study_tcga_pub_mrna", + "study_tcga_pub_sequenced"}; + String[] caseListsSampleIsNotPartOf = new String[] { + "study_tcga_pub_methylation_hm27", + }; + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + + assertSampleIdInCaseLists(sampleIdToAdd, caseListsSampleIsPartOf); + assertSampleIdNotInCaseLists(sampleIdToAdd, caseListsSampleIsNotPartOf); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--add-to-case-lists", String.join(",", caseListsSampleIsPartOf) + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, caseListsSampleIsPartOf); + assertSampleIdNotInCaseLists(sampleIdToAdd, 
caseListsSampleIsNotPartOf); + } + + @Before + public void init() { + // FIXME How we can remove this re-caching and keep tests to work? + // pre conditions (asserts before the testee operation is called) are relying on it + DaoCancerStudy.reCacheAll(); + } + + private void assertSampleIdInCaseLists(String sampleId, String... caseListStableIds) throws DaoException { + for (String caseListStableId : caseListStableIds) { + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + assertNotNull(caseListStableId + " case list has to exist", sampleList); + assertTrue(sampleId + " has to be in the " + caseListStableId + " case list", sampleList.getSampleList().contains(sampleId)); + }; + } + + private void assertSampleIdNotInCaseLists(String sampleId, String... caseListStableIds) throws DaoException { + for (String caseListStableId : caseListStableIds) { + SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); + assertNotNull(caseListStableId + " case list has to exist", sampleList); + assertTrue(sampleId + " has not to be in the " + caseListStableId + " case list", !sampleList.getSampleList().contains(sampleId)); + }; + } +} diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt new file mode 100644 index 00000000..600f753e --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/clinical_data_single_SAMPLE.txt @@ -0,0 +1,6 @@ +#Patient Identifier Sample Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-XX-0800-01 
TCGA-XX-0800 1:DECEASED 45.67 1:Recurred/Progressed 123 diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt new file mode 100644 index 00000000..b0b4753e --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_single_SAMPLE.txt From 1f5695de8caa5a5a55c16029c541837d032a1cec Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 28 Mar 2024 17:16:12 +0100 Subject: [PATCH 008/130] Add option to remove sample ids from the remaining case lists From case lists that is not _all case list and not specified with --add-to-case-lists option --- .../mskcc/cbio/portal/dao/DaoSampleList.java | 20 ++++++------- .../scripts/UpdateCaseListsSampleIds.java | 29 +++++++++++++++---- .../ITUpdateCaseListsSampleIds.java | 26 ++++++++++++++++- 3 files changed, 59 insertions(+), 16 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index e62540f7..09e05035 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -64,7 +64,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { try (ResultSet generatedKey = pstmt.getGeneratedKeys()) { if (generatedKey.next()) { int listId = generatedKey.getInt(1); - int listListRow = addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con, false); + int listListRow = addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); rows = (listListRow != -1) ? 
(rows + listListRow) : rows; } else { throw new SQLException("Creating sample list failed, no ID obtained."); @@ -214,7 +214,7 @@ public void deleteAllRecords() throws DaoException { /** * Adds record to sample_list_list. */ - private int addSampleListList(int cancerStudyId, int sampleListId, List sampleList, Connection con, boolean ignoreDuplicates) throws DaoException { + private int addSampleListList(int cancerStudyId, int sampleListId, List sampleList, Connection con) throws DaoException { if (sampleList.isEmpty()) { return 0; @@ -224,11 +224,7 @@ private int addSampleListList(int cancerStudyId, int sampleListId, List ResultSet rs = null; int skippedPatients = 0; try { - StringBuilder sql = new StringBuilder("INSERT "); - if (ignoreDuplicates) { - sql.append("IGNORE "); - } - sql.append("INTO sample_list_list (`LIST_ID`, `SAMPLE_ID`) VALUES "); + StringBuilder sql = new StringBuilder("INSERT INTO sample_list_list (`LIST_ID`, `SAMPLE_ID`) VALUES "); // NOTE - as of 12/12/14, patient lists contain sample ids for (String sampleId : sampleList) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, sampleId); @@ -252,16 +248,20 @@ private int addSampleListList(int cancerStudyId, int sampleListId, List } } - public int updateSampleListList(SampleList sampleList) throws DaoException { + public void updateSampleListList(SampleList sampleList) throws DaoException { Connection con = null; + PreparedStatement pstmt = null; try { con = JdbcUtil.getDbConnection(DaoSampleList.class); + pstmt = con.prepareStatement("DELETE FROM sample_list_list WHERE `LIST_ID` = ?"); + pstmt.setInt(1, sampleList.getSampleListId()); + pstmt.executeUpdate(); - return addSampleListList(sampleList.getCancerStudyId(), sampleList.getSampleListId(), sampleList.getSampleList(), con, true); + addSampleListList(sampleList.getCancerStudyId(), sampleList.getSampleListId(), sampleList.getSampleList(), con); } catch (SQLException e) { throw new DaoException(e); } finally { - 
JdbcUtil.closeAll(DaoSampleList.class, con, null); + JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, null); } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 0616ef74..6bf0b28c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -38,15 +38,13 @@ import joptsimple.OptionSpec; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.SampleList; import java.io.*; -import java.util.ArrayList; -import java.util.LinkedHashSet; -import java.util.List; -import java.util.Set; +import java.util.*; +import java.util.stream.Collectors; public class UpdateCaseListsSampleIds extends ConsoleRunnable { @@ -56,6 +54,7 @@ public class UpdateCaseListsSampleIds extends ConsoleRunnable { private String cancerStudyStableId; private LinkedHashSet sampleIds; private DaoSampleList daoSampleList = new DaoSampleList(); + private boolean removeFromRemainingStudyCaseLists = false; public UpdateCaseListsSampleIds(String[] args) { super(args); @@ -92,6 +91,20 @@ private void updateCaseLists() { //TODO no need to run expensive db update if sampleList hasn't effectively changed daoSampleList.updateSampleListList(sampleList); } + if (this.removeFromRemainingStudyCaseLists) { + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); + List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); + List remainingLists = sampleLists.stream().filter(sl -> + !addSamplesToTheCaseListsStableIds.contains(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.sampleIds::contains) + 
).collect(Collectors.toList()); + for (SampleList remainingList: remainingLists) { + ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); + newSampleList.removeAll(this.sampleIds); + remainingList.setSampleList(newSampleList); + //TODO for optimization purpose we could supply to the update method 2 set of samples: samples that have to be added and samples that have to be removed + daoSampleList.updateSampleListList(remainingList); + } + } } catch (DaoException e) { throw new RuntimeException(e); } @@ -153,10 +166,13 @@ private void parseArguments() { String description = "Updates (adds/removes) sample ids in specified case lists."; OptionParser parser = new OptionParser(); + //TODO Do we want to have --sample-ids option instead to make command more flexible which samples we want to add to a given profile? OptionSpec meta = parser.accepts( "meta", "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES) meta data file" ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); OptionSpec addToCaseLists = parser.accepts( "add-to-case-lists", "comma-separated list of case list stable ids to add sample ids found in the data file" ).withRequiredArg().describedAs( "study_id_mrna,study_id_sequenced" ).ofType( String.class ); + final String removeFromRemainingStudyCaseListsOption = "remove-from-remaining-study-case-lists"; + parser.accepts( removeFromRemainingStudyCaseListsOption, "Enable removing sample ids from the remaining case lists that is not _all case list and that were not specified with add-to-case-lists"); try { OptionSet options = parser.parse( args ); @@ -164,6 +180,9 @@ private void parseArguments() { if(options.has(addToCaseLists)){ this.addToCaseListsStableIds = new LinkedHashSet<>(List.of(options.valueOf(addToCaseLists).split(","))); } + if(options.has(removeFromRemainingStudyCaseListsOption)) { + this.removeFromRemainingStudyCaseLists = true; + } } catch (OptionException 
e) { throw new UsageException( progName, description, parser, diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java index 3a609c78..e72671b8 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java @@ -44,7 +44,6 @@ import org.springframework.transaction.annotation.Transactional; import java.io.File; -import java.util.List; import static org.junit.Assert.*; @@ -136,6 +135,31 @@ public void testReAddingSampleToTheSameListShouldWork() throws DaoException { assertSampleIdNotInCaseLists(sampleIdToAdd, caseListsSampleIsNotPartOf); } + /** + * Test removing sample ids from not specified case lists + */ + @Test + public void testRemovingSampleIdsFromNotSpecifiedCaseLists() throws DaoException { + String sampleIdToAdd = "TCGA-A1-A0SH-01"; + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + + UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--add-to-case-lists", "study_tcga_pub_acgh", + "--remove-from-remaining-study-case-lists" + }); + importClinicalData.run(); + + assertSampleIdInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_acgh"); + assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_cnaseq", + "study_tcga_pub_complete", + "study_tcga_pub_log2CNA", + "study_tcga_pub_methylation_hm27", + "study_tcga_pub_mrna", + "study_tcga_pub_sequenced"); + } @Before public void init() { // FIXME How we can remove this re-caching and keep tests to work? 
From 77cd6a872fb10908a5e9d1518296cc889aca2caa Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 29 Mar 2024 13:06:25 +0100 Subject: [PATCH 009/130] Make removing sample ids from not mentioned case lists a default behaviour --- .../scripts/UpdateCaseListsSampleIds.java | 30 +++++++------------ .../ITUpdateCaseListsSampleIds.java | 1 - 2 files changed, 11 insertions(+), 20 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 6bf0b28c..5ed9599a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -54,7 +54,6 @@ public class UpdateCaseListsSampleIds extends ConsoleRunnable { private String cancerStudyStableId; private LinkedHashSet sampleIds; private DaoSampleList daoSampleList = new DaoSampleList(); - private boolean removeFromRemainingStudyCaseLists = false; public UpdateCaseListsSampleIds(String[] args) { super(args); @@ -91,19 +90,17 @@ private void updateCaseLists() { //TODO no need to run expensive db update if sampleList hasn't effectively changed daoSampleList.updateSampleListList(sampleList); } - if (this.removeFromRemainingStudyCaseLists) { - CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); - List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); - List remainingLists = sampleLists.stream().filter(sl -> - !addSamplesToTheCaseListsStableIds.contains(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.sampleIds::contains) - ).collect(Collectors.toList()); - for (SampleList remainingList: remainingLists) { - ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); - newSampleList.removeAll(this.sampleIds); - remainingList.setSampleList(newSampleList); - //TODO for optimization purpose we could supply to the update 
method 2 set of samples: samples that have to be added and samples that have to be removed - daoSampleList.updateSampleListList(remainingList); - } + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); + List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); + List remainingLists = sampleLists.stream().filter(sl -> + !addSamplesToTheCaseListsStableIds.contains(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.sampleIds::contains) + ).collect(Collectors.toList()); + for (SampleList remainingList: remainingLists) { + ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); + newSampleList.removeAll(this.sampleIds); + remainingList.setSampleList(newSampleList); + //TODO for optimization purpose we could supply to the update method 2 set of samples: samples that have to be added and samples that have to be removed + daoSampleList.updateSampleListList(remainingList); } } catch (DaoException e) { throw new RuntimeException(e); @@ -171,8 +168,6 @@ private void parseArguments() { "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES) meta data file" ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); OptionSpec addToCaseLists = parser.accepts( "add-to-case-lists", "comma-separated list of case list stable ids to add sample ids found in the data file" ).withRequiredArg().describedAs( "study_id_mrna,study_id_sequenced" ).ofType( String.class ); - final String removeFromRemainingStudyCaseListsOption = "remove-from-remaining-study-case-lists"; - parser.accepts( removeFromRemainingStudyCaseListsOption, "Enable removing sample ids from the remaining case lists that is not _all case list and that were not specified with add-to-case-lists"); try { OptionSet options = parser.parse( args ); @@ -180,9 +175,6 @@ private void parseArguments() { if(options.has(addToCaseLists)){ this.addToCaseListsStableIds = new 
LinkedHashSet<>(List.of(options.valueOf(addToCaseLists).split(","))); } - if(options.has(removeFromRemainingStudyCaseListsOption)) { - this.removeFromRemainingStudyCaseLists = true; - } } catch (OptionException e) { throw new UsageException( progName, description, parser, diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java index e72671b8..4564386a 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java @@ -148,7 +148,6 @@ public void testRemovingSampleIdsFromNotSpecifiedCaseLists() throws DaoException UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { "--meta", metaFile.getAbsolutePath(), "--add-to-case-lists", "study_tcga_pub_acgh", - "--remove-from-remaining-study-case-lists" }); importClinicalData.run(); From bd8c4b2b17b75a5b52a27836b7465f941d4b37ed Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 29 Mar 2024 14:44:22 +0100 Subject: [PATCH 010/130] Make update case list command to read case lists files --- .../scripts/UpdateCaseListsSampleIds.java | 100 +++++++++++++----- .../ITUpdateCaseListsSampleIds.java | 15 +-- .../case_lists/case_mrna.txt | 3 + .../case_lists/case_acgh.txt | 3 + .../case_lists/case_cnaseq.txt | 3 + .../case_lists/case_complete.txt | 3 + .../case_lists/case_log2CNA.txt | 3 + .../case_lists/case_mrna.txt | 3 + .../case_lists/case_sequenced.txt | 3 + .../clinical_data_sample.txt | 7 ++ .../meta_clinical_sample.txt | 4 + 11 files changed, 115 insertions(+), 32 deletions(-) create mode 100644 src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt create mode 100644 
src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt create mode 100644 src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 5ed9599a..390398dc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -50,10 +50,11 @@ public class UpdateCaseListsSampleIds extends ConsoleRunnable { private File metaFile; private File dataFile; - private Set addToCaseListsStableIds = Set.of(); + private List caseListFiles = List.of(); private String cancerStudyStableId; - private LinkedHashSet sampleIds; + private Map> caseListSampleIdToSampleIds = new LinkedHashMap<>(); private DaoSampleList daoSampleList = new DaoSampleList(); + private LinkedHashSet allSampleIds; public UpdateCaseListsSampleIds(String[] args) { super(args); @@ -65,25 +66,64 @@ public UpdateCaseListsSampleIds(String[] args) { public void run() { parseArguments(); readStudyIdAndDataFileFromMetaFile(); - readSampleIdsFromDataFile(); - updateCaseLists(); + this.allSampleIds = readSampleIdsFromDataFile(this.dataFile); + // TODO has the all case list always to exist? 
+ this.caseListSampleIdToSampleIds.put(cancerStudyStableId + "_all", this.allSampleIds); + Map> readCaseListSampleIds = readCaseListFiles(); + this.caseListSampleIdToSampleIds.putAll(readCaseListSampleIds); + updateCaseLists(this.caseListSampleIdToSampleIds); } - private void updateCaseLists() { + private Map> readCaseListFiles() { + LinkedHashMap> result = new LinkedHashMap<>(); + for (File caseListFile: this.caseListFiles) { + Properties properties = new TrimmedProperties(); + try { + properties.load(new FileReader(caseListFile)); + } catch (IOException e) { + throw new RuntimeException(e); + } + String studyId = properties.getProperty("cancer_study_identifier"); + if (studyId == null || studyId.trim().equals("")) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": No cancer_study_identifier specified."); + } + if (!studyId.equals(this.cancerStudyStableId)) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": cancer_study_identifier expected to be " + this.cancerStudyStableId + " but found to be " + studyId); + } + String caseListStableId = properties.getProperty("stable_id"); + if (caseListStableId == null || caseListStableId.trim().equals("")) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": No stable_id specified."); + } + String caseListSampleIds = properties.getProperty("case_list_ids"); + if (caseListSampleIds == null || caseListSampleIds.trim().equals("")) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": No case_list_ids specified."); + } + Set sampleIds = Arrays.stream(caseListSampleIds.split("\t")).map(sampleId -> sampleId.trim()).filter(sampleId -> !"".equals(sampleId.trim())).collect(Collectors.toSet()); + if (sampleIds.isEmpty()) { + throw new RuntimeException(caseListFile.getAbsolutePath() + ": No sample ids specified."); + } + LinkedHashSet extraSampleIds = new LinkedHashSet<>(sampleIds); + extraSampleIds.removeAll(this.allSampleIds); + if (!extraSampleIds.isEmpty()) { + throw new 
RuntimeException(caseListFile.getAbsolutePath() + ": The following sample ids present in the case list file, but not specified in the clinical sample file: " + String.join(", ", extraSampleIds)); + } + result.put(caseListStableId, sampleIds); + } + return result; + } + + private void updateCaseLists(Map> caseListSampleIdToSampleIds) { // TODO Do we really have to do this? Is there a better way? DaoCancerStudy.reCacheAll(); try { - Set addSamplesToTheCaseListsStableIds = new LinkedHashSet<>(this.addToCaseListsStableIds); - // TODO has the all case list always to exist? - String allCaseListStableId = this.cancerStudyStableId + "_all"; - // we always add sample to the all case list - addSamplesToTheCaseListsStableIds.add(allCaseListStableId); - for (String caseListStableId: addSamplesToTheCaseListsStableIds) { + for (Map.Entry> caseListStableIdToSampleIds: caseListSampleIdToSampleIds.entrySet()) { + String caseListStableId = caseListStableIdToSampleIds.getKey(); + Set sampleIds = caseListStableIdToSampleIds.getValue(); SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); if (sampleList == null) { throw new RuntimeException("No case list with " + caseListStableId + " stable id is found"); } - LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(this.sampleIds); + LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(sampleIds); newCaseListSampleIds.addAll(sampleList.getSampleList()); ArrayList newSampleArrayList = new ArrayList<>(newCaseListSampleIds); sampleList.setSampleList(newSampleArrayList); @@ -93,11 +133,11 @@ private void updateCaseLists() { CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); List remainingLists = sampleLists.stream().filter(sl -> - !addSamplesToTheCaseListsStableIds.contains(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.sampleIds::contains) + 
!caseListSampleIdToSampleIds.containsKey(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.allSampleIds::contains) ).collect(Collectors.toList()); for (SampleList remainingList: remainingLists) { ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); - newSampleList.removeAll(this.sampleIds); + newSampleList.removeAll(this.allSampleIds); remainingList.setSampleList(newSampleList); //TODO for optimization purpose we could supply to the update method 2 set of samples: samples that have to be added and samples that have to be removed daoSampleList.updateSampleListList(remainingList); @@ -107,11 +147,11 @@ private void updateCaseLists() { } } - private void readSampleIdsFromDataFile() { - this.sampleIds = new LinkedHashSet<>(); + private LinkedHashSet readSampleIdsFromDataFile(File dataFile) { + LinkedHashSet allSampleIds = new LinkedHashSet<>(); FileReader reader = null; try { - reader = new FileReader(this.dataFile); + reader = new FileReader(dataFile); try (BufferedReader buff = new BufferedReader(reader)) { String line; int sampleIdPosition = -1; @@ -128,9 +168,10 @@ private void readSampleIdsFromDataFile() { throw new RuntimeException("No SAMPLE_ID header is found"); } } else { - sampleIds.add(fieldValues[sampleIdPosition].trim()); + allSampleIds.add(fieldValues[sampleIdPosition].trim()); } } + return allSampleIds; } } catch (Exception e) { throw new RuntimeException(e); @@ -163,17 +204,24 @@ private void parseArguments() { String description = "Updates (adds/removes) sample ids in specified case lists."; OptionParser parser = new OptionParser(); - //TODO Do we want to have --sample-ids option instead to make command more flexible which samples we want to add to a given profile? 
- OptionSpec meta = parser.accepts( "meta", - "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES) meta data file" ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); - OptionSpec addToCaseLists = parser.accepts( "add-to-case-lists", - "comma-separated list of case list stable ids to add sample ids found in the data file" ).withRequiredArg().describedAs( "study_id_mrna,study_id_sequenced" ).ofType( String.class ); + OptionSpec metaOpt = parser.accepts( "meta", + "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES or datatype=MIXED_ATTRIBUTES) meta data file. All sample ids found in the file will be added to the _all case list." ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); + OptionSpec caseListDirOrFileOpt = parser.accepts( "case-lists", + "case list file or a directory with case list files" ).withRequiredArg().describedAs( "case_lists/" ).ofType( String.class ); try { OptionSet options = parser.parse( args ); - this.metaFile = new File(options.valueOf(meta)); - if(options.has(addToCaseLists)){ - this.addToCaseListsStableIds = new LinkedHashSet<>(List.of(options.valueOf(addToCaseLists).split(","))); + this.metaFile = new File(options.valueOf(metaOpt)); + if(options.has(caseListDirOrFileOpt)){ + File caseListDirOrFile = new File(options.valueOf(caseListDirOrFileOpt)); + if (caseListDirOrFile.isDirectory()) { + this.caseListFiles = Arrays.stream(Objects.requireNonNull(caseListDirOrFile.listFiles())) + .filter(file -> !file.getName().startsWith(".") && !file.getName().endsWith("~")).collect(Collectors.toList()); + } else if (caseListDirOrFile.isFile()) { + this.caseListFiles = List.of(caseListDirOrFile); + } else { + throw new RuntimeException("No file " + caseListDirOrFile.getAbsolutePath() + " exists"); + } } } catch (OptionException e) { throw new UsageException( diff --git 
a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java index 4564386a..a98663b4 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java @@ -66,7 +66,6 @@ public class ITUpdateCaseListsSampleIds { @Test public void testAddSampleIdToAllCaseList() throws DaoException { String sampleIdToAdd = "TCGA-XX-0800-01"; - String allCaseListStableId = "study_tcga_pub_all"; File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); @@ -89,12 +88,13 @@ public void testAddSampleIdToMrnaCaseList() throws DaoException { String sampleIdToAdd = "TCGA-XX-0800-01"; File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/add_sample_to_case_list/"); File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new File(singleTcgaSampleFolder, "case_lists/"); assertSampleIdNotInCaseLists(sampleIdToAdd, "study_tcga_pub_all", "study_tcga_pub_mrna"); UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { "--meta", metaFile.getAbsolutePath(), - "--add-to-case-lists", "study_tcga_pub_mrna" + "--case-lists", caseListsDir.getAbsolutePath() }); importClinicalData.run(); @@ -119,15 +119,16 @@ public void testReAddingSampleToTheSameListShouldWork() throws DaoException { "study_tcga_pub_methylation_hm27", }; - File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/update_tcga_samples/"); File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new 
File(singleTcgaSampleFolder, "case_lists/"); assertSampleIdInCaseLists(sampleIdToAdd, caseListsSampleIsPartOf); assertSampleIdNotInCaseLists(sampleIdToAdd, caseListsSampleIsNotPartOf); UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { "--meta", metaFile.getAbsolutePath(), - "--add-to-case-lists", String.join(",", caseListsSampleIsPartOf) + "--case-lists", caseListsDir.getAbsolutePath() }); importClinicalData.run(); @@ -142,12 +143,14 @@ public void testReAddingSampleToTheSameListShouldWork() throws DaoException { public void testRemovingSampleIdsFromNotSpecifiedCaseLists() throws DaoException { String sampleIdToAdd = "TCGA-A1-A0SH-01"; - File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_sample/"); + File singleTcgaSampleFolder = new File("src/test/resources/update_case_lists/update_tcga_samples/"); File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_sample.txt"); + File caseListsDir = new File(singleTcgaSampleFolder, "case_lists/"); + File caseAcghFile = new File(caseListsDir, "case_acgh.txt"); UpdateCaseListsSampleIds importClinicalData = new UpdateCaseListsSampleIds(new String[] { "--meta", metaFile.getAbsolutePath(), - "--add-to-case-lists", "study_tcga_pub_acgh", + "--case-lists", caseAcghFile.getAbsolutePath() }); importClinicalData.run(); diff --git a/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt b/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt new file mode 100644 index 00000000..32619f58 --- /dev/null +++ b/src/test/resources/update_case_lists/add_sample_to_case_list/case_lists/case_mrna.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_mrna +case_list_ids: TCGA-XX-0800-01 \ No newline at end of file diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt 
b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt new file mode 100644 index 00000000..95f18659 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_acgh.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_acgh +case_list_ids: TCGA-A1-A0SH-01 TCGA-XX-0800-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt new file mode 100644 index 00000000..0101d819 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_cnaseq.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_cnaseq +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt new file mode 100644 index 00000000..d2bb08b4 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_complete.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_complete +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt new file mode 100644 index 00000000..e8c3baec --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_log2CNA.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_log2CNA +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt new file mode 100644 index 00000000..093ed0fe --- /dev/null +++ 
b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_mrna.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_mrna +case_list_ids: TCGA-A1-A0SH-01 TCGA-XX-0800-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt new file mode 100644 index 00000000..03c493c9 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/case_lists/case_sequenced.txt @@ -0,0 +1,3 @@ +cancer_study_identifier: study_tcga_pub +stable_id: study_tcga_pub_sequenced +case_list_ids: TCGA-A1-A0SH-01 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt b/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt new file mode 100644 index 00000000..1e7552e2 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/clinical_data_sample.txt @@ -0,0 +1,7 @@ +#Patient Identifier Sample Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient Identifier Sample identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SH-01 TCGA-A1-A0SH 1:DECEASED 45.67 1:Recurred/Progressed 123 +TCGA-XX-0800-01 TCGA-XX-0800 0:LIVING 56.78 1:Recurred/Progressed 234 diff --git a/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt b/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt new file mode 100644 index 00000000..09c7ba99 --- /dev/null +++ b/src/test/resources/update_case_lists/update_tcga_samples/meta_clinical_sample.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub 
+genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: clinical_data_sample.txt From 5fc633b28f3a36a229607794afa36a198b6f6c4f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 29 Mar 2024 20:40:22 +0100 Subject: [PATCH 011/130] Fix test clinical data headers --- .../insert_single_tcga_sample/clinical_data_single_SAMPLE.txt | 4 ++-- .../update_single_tcga_sample/clinical_data_single_SAMPLE.txt | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt index e8b77aec..1feeebbc 100644 --- a/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt +++ b/src/test/resources/incremental/insert_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -1,5 +1,5 @@ -#Patient Identifier Sample Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) -#Patient Identifier Sample identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#Sample Identifier Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier Patient Identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment #STRING STRING STRING STRING NUMBER STRING NUMBER #1 1 1 1 1 1 1 SAMPLE_ID PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS diff --git a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt index 5088066f..12d0b7c0 100644 --- 
a/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt +++ b/src/test/resources/incremental/update_single_tcga_sample/clinical_data_single_SAMPLE.txt @@ -1,5 +1,5 @@ -#Patient Identifier Sample Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) -#Patient Identifier Sample identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#Sample Identifier Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Sample identifier Patient Identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment #STRING STRING STRING NUMBER STRING NUMBER #1 1 1 1 1 1 SAMPLE_ID PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS From f7132c9762e223fcc682a65b7358582c89a3dfa8 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Mon, 1 Apr 2024 16:37:12 +0200 Subject: [PATCH 012/130] Test incremental patient upload --- .../ITIncrementalPatientsImport.java | 100 ++++++++++++++++++ .../clinical_data_single_PATIENT.txt | 6 ++ .../meta_clinical_patient.txt | 4 + 3 files changed, 110 insertions(+) create mode 100644 src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java create mode 100644 src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt create mode 100644 src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java new file mode 100644 index 00000000..78dea6ff --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2015 
Memorial Sloan-Kettering Cancer Center. + * + * This library is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS + * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder + * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no + * obligations to provide maintenance, support, updates, enhancements or + * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be + * liable to any party for direct, indirect, special, incidental or + * consequential damages, including lost profits, arising out of the use of this + * software and its documentation, even if Memorial Sloan-Kettering Cancer + * Center has been advised of the possibility of such damage. + */ + +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.scripts.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.scripts.ImportClinicalData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Tests Incremental Import of Sample Clinical Data. + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class ITIncrementalPatientsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + + @Test + public void testInsertNewPatient() throws DaoException { + String newPatientId = "TEST-INC-TCGA-P2"; + File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_patient.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_PATIENT.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + Patient newPatient = 
DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), newPatientId); + assertNotNull("Patient with id " + newPatientId + " has to be injected to the DB.", newPatient); + + List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(newPatientId)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_STATUS", "0:LIVING", + "OS_MONTHS", "45.6", + "DFS_STATUS", "1:Recurred/Progressed"), sampleAttrs); + } +} diff --git a/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt b/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt new file mode 100644 index 00000000..38b9ef9f --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_patient/clinical_data_single_PATIENT.txt @@ -0,0 +1,6 @@ +#Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TEST-INC-TCGA-P2 basal-like 0:LIVING 45.6 1:Recurred/Progressed NA diff --git a/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt b/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt new file mode 100644 index 00000000..9e418c43 --- /dev/null +++ b/src/test/resources/incremental/insert_single_tcga_patient/meta_clinical_patient.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: clinical_data_single_PATIENT.txt From f45e1e80d94daa0a4c11f20e5d56ec79907e0b61 Mon Sep 17 00:00:00 2001 
From: Ruslan Forostianov Date: Tue, 2 Apr 2024 16:16:28 +0200 Subject: [PATCH 013/130] Add flag to reload patient clinical attributes --- .../cbio/portal/dao/DaoClinicalData.java | 20 +++++++++++ .../portal/scripts/ImportClinicalData.java | 5 ++- .../ITIncrementalPatientsImport.java | 33 +++++++++++++++++++ .../clinical_data_single_PATIENT.txt | 6 ++++ .../meta_clinical_patient.txt | 4 +++ 5 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt create mode 100644 src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index 8e8acaf1..1b2ccd25 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -54,6 +54,8 @@ public final class DaoClinicalData { private static final String PATIENT_INSERT = "INSERT INTO " + PATIENT_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; private static final String SAMPLE_DELETE = "DELETE FROM " + SAMPLE_TABLE + " WHERE `INTERNAL_ID` = ?"; + + private static final String PATIENT_DELETE = "DELETE FROM " + PATIENT_TABLE + " WHERE `INTERNAL_ID` = ?"; private static final Map sampleAttributes = new HashMap(); private static final Map patientAttributes = new HashMap(); @@ -679,4 +681,22 @@ public static Map> getCancerTypeInfoBySamples(List s JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, rs); } } + + public static void removePatientData(int internalPatientId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalData.class); + pstmt = con.prepareStatement(PATIENT_DELETE); + pstmt.setInt(1, internalPatientId); + pstmt.executeUpdate(); + } + catch (SQLException e) { + throw 
new DaoException(e); + } + finally { + JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, rs); + } + } } \ No newline at end of file diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 0978354f..d2143aff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -333,8 +333,8 @@ private boolean addDatum(String[] fields, List columnAttrs, M //check if sample is not already added: Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), stableSampleId, false); if (sample != null) { + internalSampleId = sample.getInternalId(); if (overwriteExisting) { - internalSampleId = sample.getInternalId(); DaoClinicalData.removeSampleData(internalSampleId); } else { //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) @@ -356,6 +356,9 @@ private boolean addDatum(String[] fields, List columnAttrs, M if (patient != null) { //patient exists, get internal id: internalPatientId = patient.getInternalId(); + if (overwriteExisting) { + DaoClinicalData.removePatientData(internalPatientId); + } } else { //add patient: diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java index 78dea6ff..4ca89e21 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java @@ -97,4 +97,37 @@ public void testInsertNewPatient() throws DaoException { "OS_MONTHS", "45.6", "DFS_STATUS", "1:Recurred/Progressed"), sampleAttrs); } + + @Test + public void testUpdatePatientAttributes() throws DaoException { + String updatedPatientId = 
"TCGA-A1-A0SB"; + + Patient tcgaPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), + updatedPatientId); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), "SUBTYPE", "Luminal A"); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), "OS_STATUS", "0:LIVING"); + DaoClinicalData.addPatientDatum(tcgaPatient.getInternalId(), "OS_MONTHS", "34.56"); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/update_single_tcga_patient/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_clinical_patient.txt"); + File dataFile = new File(singleTcgaSampleFolder, "clinical_data_single_PATIENT.txt"); + + ImportClinicalData importClinicalData = new ImportClinicalData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importClinicalData.run(); + + Patient newPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), updatedPatientId); + assertNotNull("Patient with id " + updatedPatientId + " has to be injected to the DB.", newPatient); + + List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(updatedPatientId)); + Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "SUBTYPE", "basal-like", + "OS_MONTHS", "56.7", + "DFS_STATUS", "1:Recurred/Progressed", + "DFS_MONTHS", "100"), sampleAttrs); + } } diff --git a/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt b/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt new file mode 100644 index 00000000..37421482 --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_patient/clinical_data_single_PATIENT.txt @@ -0,0 +1,6 @@ +#Patient Identifier Subtype Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) 
+#Patient identifier Subtype description Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 1 +PATIENT_ID SUBTYPE OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-A1-A0SB basal-like NA 56.7 1:Recurred/Progressed 100 diff --git a/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt b/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt new file mode 100644 index 00000000..9e418c43 --- /dev/null +++ b/src/test/resources/incremental/update_single_tcga_patient/meta_clinical_patient.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: clinical_data_single_PATIENT.txt From 8cc95a0c3060c32d15d8f4d5d5fc0915aee9ab0a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 3 Apr 2024 12:58:23 +0200 Subject: [PATCH 014/130] Add TODO comment to remove MIXED_ATTRIBUTES data type with a reference to the ticket --- .../java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index d2143aff..a57bc0bb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -43,6 +43,7 @@ import java.util.regex.*; import org.apache.commons.collections4.map.MultiKeyMap; +//TODO Remove MIXED_ATTRIBUTES data type https://github.com/cBioPortal/cbioportal-core/issues/31 public class ImportClinicalData extends ConsoleRunnable { public static final String DELIMITER = "\t"; @@ -103,6 +104,7 @@ public static enum AttributeTypes { PATIENT_ATTRIBUTES("PATIENT"), SAMPLE_ATTRIBUTES("SAMPLE"), + @Deprecated MIXED_ATTRIBUTES("MIXED"); private 
String attributeType; From fa32b7fba53f7999efb6e8bf5cefaaaecb6fc23d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 3 Apr 2024 17:15:14 +0200 Subject: [PATCH 015/130] WIP adopt py script to incremental upload --- scripts/importer/cbioportalImporter.py | 134 +++++++++++++++++++------ scripts/importer/cbioportal_common.py | 9 ++ scripts/importer/metaImport.py | 17 ++-- scripts/importer/validateData.py | 82 +++++++++------ 4 files changed, 175 insertions(+), 67 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index 97073ff6..b6111b6d 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -39,6 +39,8 @@ from .cbioportal_common import ADD_CASE_LIST_CLASS from .cbioportal_common import VERSION_UTIL_CLASS from .cbioportal_common import run_java +from .cbioportal_common import UPDATE_CASE_LIST_CLASS +from .cbioportal_common import INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES # ------------------------------------------------------------------------------ @@ -101,8 +103,17 @@ def remove_study_id(jvm_args, study_id): args.append("--noprogress") # don't report memory usage and % progress run_java(*args) +def update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir = None): + args = jvm_args.split(' ') + args.append(UPDATE_CASE_LIST_CLASS) + args.append("--meta") + args.append(meta_filename) + if case_lists_file_or_dir: + args.append("--case-lists") + args.append(case_lists_file_or_dir) + run_java(*args) -def import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity = None, meta_file_dictionary = None): +def import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity = None, meta_file_dictionary = None, incremental = False): args = jvm_args.split(' ') # In case the meta file is already parsed in a previous function, it is not @@ -133,6 +144,10 @@ def import_study_data(jvm_args, meta_filename, data_filename, 
update_generic_ass importer = IMPORTER_CLASSNAME_BY_META_TYPE[meta_file_type] args.append(importer) + if incremental: + if meta_file_type not in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + raise NotImplementedError("This type does not support incremental upload: {}".format(meta_file_type)) + args.append("--overwrite-existing") if IMPORTER_REQUIRES_METADATA[importer]: args.append("--meta") args.append(meta_filename) @@ -212,11 +227,20 @@ def process_command(jvm_args, command, meta_filename, data_filename, study_ids, else: raise RuntimeError('Your command uses both -id and -meta. Please, use only one of the two parameters.') elif command == IMPORT_STUDY_DATA: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity) elif command == IMPORT_CASE_LIST: import_case_list(jvm_args, meta_filename) -def process_directory(jvm_args, study_directory, update_generic_assay_entity = None): +def get_meta_filenames(data_directory): + meta_filenames = ( + os.path.join(data_directory, meta_filename) for + meta_filename in os.listdir(data_directory) if + re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, + flags=re.IGNORECASE) and + not (meta_filename.startswith('.') or meta_filename.endswith('~'))) + return meta_filenames + +def process_study_directory(jvm_args, study_directory, update_generic_assay_entity = None): """ Import an entire study directory based on meta files found. 
@@ -241,12 +265,7 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N cna_long_filepair = None # Determine meta filenames in study directory - meta_filenames = ( - os.path.join(study_directory, meta_filename) for - meta_filename in os.listdir(study_directory) if - re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, - flags=re.IGNORECASE) and - not (meta_filename.startswith('.') or meta_filename.endswith('~'))) + meta_filenames = get_meta_filenames(study_directory) # Read all meta files (excluding case lists) to determine what to import for meta_filename in meta_filenames: @@ -353,53 +372,53 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N raise RuntimeError('No sample attribute file found') else: meta_filename, data_filename = sample_attr_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, we need to import resource definitions for resource data if resource_definition_filepair is not None: meta_filename, data_filename = resource_definition_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, we need to import sample definitions for resource data if sample_resource_filepair is not None: meta_filename, data_filename = sample_resource_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Next, import everything else except gene panel, structural variant data, GSVA and # z-score 
expression. If in the future more types refer to each other, (like # in a tree structure) this could be programmed in a recursive fashion. for meta_filename, data_filename in regular_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import structural variant data if structural_variant_filepair is not None: meta_filename, data_filename = structural_variant_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import cna data if cna_long_filepair is not None: meta_filename, data_filename = cna_long_filepair - import_study_data(jvm_args=jvm_args, meta_filename=meta_filename, data_filename=data_filename, - meta_file_dictionary=study_meta_dictionary[meta_filename]) + import_data(jvm_args=jvm_args, meta_filename=meta_filename, data_filename=data_filename, + meta_file_dictionary=study_meta_dictionary[meta_filename]) # Import expression z-score (after expression) for meta_filename, data_filename in zscore_filepairs: - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import GSVA genetic profiles (after expression and z-scores) if gsva_score_filepair is not None: # First import the GSVA score data meta_filename, data_filename = gsva_score_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, 
study_meta_dictionary[meta_filename]) # Second import the GSVA p-value data meta_filename, data_filename = gsva_pvalue_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) if gene_panel_matrix_filepair is not None: meta_filename, data_filename = gene_panel_matrix_filepair - import_study_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, study_meta_dictionary[meta_filename]) # Import the case lists case_list_dirname = os.path.join(study_directory, 'case_lists') @@ -413,6 +432,56 @@ def process_directory(jvm_args, study_directory, update_generic_assay_entity = N update_study_status(jvm_args, study_id) +def process_data_directory(jvm_args, data_directory, update_generic_assay_entity = None): + """ + Incremental import of data directory based on meta files found. + + 1. Determine meta files in directory. + 2. Read all meta files and determine file types. + 3. Import data files in specific order by file type with the incremental flag. 
+ """ + + meta_file_type_to_meta_files = {} + + # Determine meta filenames in study directory + meta_filenames = get_meta_filenames(data_directory) + + # Read all meta files (excluding case lists) to determine what to import + for meta_filename in meta_filenames: + + # Parse meta file + meta_dictionary = cbioportal_common.parse_metadata_file( + meta_filename, logger=LOGGER) + + # Retrieve meta file type + meta_file_type = meta_dictionary['meta_file_type'] + if meta_file_type is None: + # invalid meta file, let's die + raise RuntimeError('Invalid meta file: ' + meta_filename) + if meta_file_type not in meta_file_type_to_meta_files: + meta_file_type_to_meta_files[meta_file_type] = [] + + meta_file_type_to_meta_files[meta_file_type].append((meta_filename, meta_dictionary)) + + + not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES + if not_supported_meta_types: + raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) + # TODO it's to fragile to rely on the order of types like that. Too implicit + for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + meta_filename, meta_dictionary = meta_file_type_to_meta_files[meta_file_type] + data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) + + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) + # TODO we could also validate not supported types after loading is done? 
+ + if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: + # TODO Make meta attribute optional in command to update case lists + meta_filename, meta_dictionary = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES][0] + case_list_dirname = os.path.join(data_directory, 'case_lists') + update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) + + def usage(): # TODO : replace this by usage string from interface() print(('cbioportalImporter.py --jar-path (path to scripts jar file) ' + @@ -435,20 +504,21 @@ def check_files(meta_filename, data_filename): print('data-file cannot be found:' + data_filename, file=ERROR_FILE) sys.exit(2) -def check_dir(study_directory): +def check_dir(data_directory): # check existence of directory - if not os.path.exists(study_directory) and study_directory != '': - print('Study cannot be found: ' + study_directory, file=ERROR_FILE) + if not os.path.exists(data_directory) and data_directory != '': + print('Directory cannot be found: ' + data_directory, file=ERROR_FILE) sys.exit(2) def add_parser_args(parser): - parser.add_argument('-s', '--study_directory', type=str, required=False, - help='Path to Study Directory') + data_source_group = parser.add_mutually_exclusive_group() + data_source_group.add_argument('-s', '--study_directory', type=str, help='Path to Study Directory') + data_source_group.add_argument('-d', '--data_directory', type=str, help='Path to Data Directory') parser.add_argument('-jvo', '--java_opts', type=str, default=os.environ.get('JAVA_OPTS'), help='Path to specify JAVA_OPTS for the importer. 
\ - (default: gets the JAVA_OPTS from the environment)') + (default: gets the JAVA_OPTS from the environment)') parser.add_argument('-jar', '--jar_path', type=str, required=False, - help='Path to scripts JAR file') + help='Path to scripts JAR file') parser.add_argument('-meta', '--meta_filename', type=str, required=False, help='Path to meta file') parser.add_argument('-data', '--data_filename', type=str, required=False, @@ -547,14 +617,16 @@ def main(args): # process the options jvm_args = "-Dspring.profiles.active=dbcp " + args.java_opts - study_directory = args.study_directory # check if DB version and application version are in sync check_version(jvm_args) - if study_directory != None: - check_dir(study_directory) - process_directory(jvm_args, study_directory, args.update_generic_assay_entity) + if args.data_directory is not None: + check_dir(args.data_directory) + process_data_directory(jvm_args, args.data_directory, args.update_generic_assay_entity) + elif args.study_directory is not None: + check_dir(args.study_directory) + process_study_directory(jvm_args, args.study_directory, args.update_generic_assay_entity) else: check_args(args.command) check_files(args.meta_filename, args.data_filename) diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 35f71d34..eaa38a5e 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -37,6 +37,7 @@ IMPORT_CANCER_TYPE_CLASS = "org.mskcc.cbio.portal.scripts.ImportTypesOfCancers" IMPORT_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.ImportSampleList" ADD_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.AddCaseList" +UPDATE_CASE_LIST_CLASS = "org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds" VERSION_UTIL_CLASS = "org.mskcc.cbio.portal.util.VersionUtil" PORTAL_PROPERTY_DATABASE_USER = 'db.user' @@ -364,6 +365,14 @@ class MetaFileTypes(object): }, } +# in order of they should be loaded +INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES = [ + 
MetaFileTypes.PATIENT_ATTRIBUTES, + MetaFileTypes.SAMPLE_ATTRIBUTES, + MetaFileTypes.MUTATION, + # TODO Add more types here as incremental upload is enabled +] + IMPORTER_CLASSNAME_BY_META_TYPE = { MetaFileTypes.STUDY: IMPORT_STUDY_CLASS, MetaFileTypes.CANCER_TYPE: IMPORT_CANCER_TYPE_CLASS, diff --git a/scripts/importer/metaImport.py b/scripts/importer/metaImport.py index 7fdced9f..1e06049e 100755 --- a/scripts/importer/metaImport.py +++ b/scripts/importer/metaImport.py @@ -56,8 +56,11 @@ class Color(object): def interface(): parser = argparse.ArgumentParser(description='cBioPortal meta Importer') - parser.add_argument('-s', '--study_directory', type=str, required=True, - help='path to directory.') + data_source_group = parser.add_mutually_exclusive_group() + data_source_group.add_argument('-s', '--study_directory', + type=str, help='path to study directory.') + data_source_group.add_argument('-d', '--data_directory', + type=str, help='path to directory.') portal_mode_group = parser.add_mutually_exclusive_group() portal_mode_group.add_argument('-u', '--url_server', type=str, @@ -115,7 +118,7 @@ def interface(): # supply parameters that the validation script expects to have parsed args.error_file = False - study_dir = args.study_directory + data_dir = args.data_directory if args.data_directory is not None else args.study_directory # Validate the study directory. 
print("Starting validation...\n", file=sys.stderr) @@ -139,9 +142,9 @@ def interface(): # Import OncoKB annotations when asked, and there are no validation warnings or warnings are overruled study_is_valid = exitcode == 0 or (exitcode == 3 and args.override_warning) if study_is_valid and args.import_oncokb: - mutation_meta_file_path = libImportOncokb.find_meta_file_by_fields(study_dir, {'genetic_alteration_type': 'MUTATION_EXTENDED'}) + mutation_meta_file_path = libImportOncokb.find_meta_file_by_fields(data_dir, {'genetic_alteration_type': 'MUTATION_EXTENDED'}) mutation_data_file_name = libImportOncokb.find_data_file_from_meta_file(mutation_meta_file_path) - mutation_data_file_path = os.path.join(study_dir, mutation_data_file_name) + mutation_data_file_path = os.path.join(data_dir, mutation_data_file_name) study_is_modified = False print("\n") if os.path.exists(mutation_data_file_path): @@ -163,9 +166,9 @@ def interface(): for log_handler in validator_logger.handlers: log_handler.close() validator_logger.handlers = [] - cna_meta_file_path = libImportOncokb.find_meta_file_by_fields(study_dir, {'genetic_alteration_type': 'COPY_NUMBER_ALTERATION', 'datatype': 'DISCRETE'}) + cna_meta_file_path = libImportOncokb.find_meta_file_by_fields(data_dir, {'genetic_alteration_type': 'COPY_NUMBER_ALTERATION', 'datatype': 'DISCRETE'}) cna_data_file_name = libImportOncokb.find_data_file_from_meta_file(cna_meta_file_path) - cna_data_file_path = os.path.join(study_dir, cna_data_file_name) + cna_data_file_path = os.path.join(data_dir, cna_data_file_name) if os.path.exists(cna_data_file_path): print("Starting import of OncoKB annotations for discrete CNA file ...\n", file=sys.stderr) try: diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 1f473abb..b5f3f3f6 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -70,7 +70,6 @@ DEFINED_SAMPLE_IDS = None DEFINED_SAMPLE_ATTRIBUTES = None PATIENTS_WITH_SAMPLES = None 
-DEFINED_CANCER_TYPES = None mutation_sample_ids = None mutation_file_sample_ids = set() sample_ids_panel_dict = {} @@ -727,7 +726,7 @@ def checkSampleId(self, sample_id, column_number): Return True if the sample id was valid, False otherwise. """ - if sample_id not in DEFINED_SAMPLE_IDS: + if DEFINED_SAMPLE_IDS is not None and sample_id not in DEFINED_SAMPLE_IDS: self.logger.error( 'Sample ID not defined in clinical file', extra={'line_number': self.line_number, @@ -741,7 +740,7 @@ def checkPatientId(self, patient_id, column_number): Return True if the patient id was valid, False otherwise. """ - if patient_id not in PATIENTS_WITH_SAMPLES: + if PATIENTS_WITH_SAMPLES is not None and patient_id not in PATIENTS_WITH_SAMPLES: self.logger.error( 'Patient ID not defined in clinical file', extra={'line_number': self.line_number, @@ -2871,7 +2870,7 @@ def checkHeader(self, cols): 'cause': 'SAMPLE_ID'}) # refuse to define attributes also defined in the sample-level file for attribute_id in self.defined_attributes: - if attribute_id in DEFINED_SAMPLE_ATTRIBUTES: + if DEFINED_SAMPLE_ATTRIBUTES is not None and attribute_id in DEFINED_SAMPLE_ATTRIBUTES: # log this as a file-aspecific error, using the base logger self.logger.logger.error( 'Clinical attribute is defined both as sample-level and ' @@ -2912,7 +2911,7 @@ def checkLine(self, data): self.patient_id_lines[value])}) else: self.patient_id_lines[value] = self.line_number - if value not in PATIENTS_WITH_SAMPLES: + if PATIENTS_WITH_SAMPLES is not None and value not in PATIENTS_WITH_SAMPLES: self.logger.warning( 'Clinical data defined for a patient with ' 'no samples', @@ -2979,12 +2978,13 @@ def checkLine(self, data): def onComplete(self): """Perform final validations based on the data parsed.""" - for patient_id in PATIENTS_WITH_SAMPLES: - if patient_id not in self.patient_id_lines: - self.logger.warning( - 'Missing clinical data for a patient associated with ' - 'samples', - extra={'cause': patient_id}) + if 
PATIENTS_WITH_SAMPLES: + for patient_id in PATIENTS_WITH_SAMPLES: + if patient_id not in self.patient_id_lines: + self.logger.warning( + 'Missing clinical data for a patient associated with ' + 'samples', + extra={'cause': patient_id}) super(PatientClinicalValidator, self).onComplete() @@ -4801,11 +4801,6 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str else: validators_by_type[meta_file_type].append(None) - if study_cancer_type is None: - logger.error( - 'Cancer type needs to be defined for a study. Verify that you have a study file ' - 'and have defined the cancer type correctly.') - # prepend the cancer study id to any case list suffixes defined_case_list_fns = {} if study_id is not None: @@ -4930,7 +4925,7 @@ def processCaseListDirectory(caseListDir, cancerStudyId, logger, for value in seen_sample_ids: # Compare case list sample ids with clinical file - if value not in DEFINED_SAMPLE_IDS: + if DEFINED_SAMPLE_IDS is not None and value not in DEFINED_SAMPLE_IDS: logger.error( 'Sample ID not defined in clinical file', extra={'filename_': case, @@ -5293,8 +5288,11 @@ def load_portal_info(path, logger, offline=False): # ------------------------------------------------------------------------------ def interface(args=None): parser = argparse.ArgumentParser(description='cBioPortal study validator') - parser.add_argument('-s', '--study_directory', - type=str, required=True, help='path to directory.') + data_source_group = parser.add_mutually_exclusive_group() + data_source_group.add_argument('-s', '--study_directory', + type=str, help='path to study directory.') + data_source_group.add_argument('-d', '--data_directory', + type=str, help='path to directory.') portal_mode_group = parser.add_mutually_exclusive_group() portal_mode_group.add_argument('-u', '--url_server', type=str, @@ -5341,7 +5339,6 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ attributes are not None. 
""" - global DEFINED_CANCER_TYPES global DEFINED_SAMPLE_IDS global DEFINED_SAMPLE_ATTRIBUTES global PATIENTS_WITH_SAMPLES @@ -5369,6 +5366,11 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ stable_ids, tags_file_path) = process_metadata_files(study_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + if study_cancer_type is None: + logger.error( + 'Cancer type needs to be defined for a study. Verify that you have a study file ' + 'and have defined the cancer type correctly.') + # first parse and validate cancer type files studydefined_cancer_types = [] if cbioportal_common.MetaFileTypes.CANCER_TYPE in validators_by_meta_type: @@ -5545,7 +5547,17 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ # additional validation between meta files, after all meta files are processed validate_data_relations(validators_by_meta_type, logger) - logger.info('Validation complete') + +def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks): + # walk over the meta files in the dir and get properties of the study + validators_by_meta_type, *_ = process_metadata_files(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + for meta_file_type, validators in validators_by_meta_type.items(): + # if there was no validator for this meta file + if not validators: + continue + logger.info("Validating %s", meta_file_type) + for validator in validators: + validator.validate() def get_pom_path(): @@ -5565,7 +5577,15 @@ def main_validate(args): logger.addHandler(exit_status_handler) # process the options - study_dir = args.study_directory + if args.study_directory: + data_dir = args.study_directory + partial_data = False + elif args.data_directory: + data_dir = args.data_directory + partial_data = True + else: + raise RuntimeError("Neither study_directory nor data_directory argument is specified.") + server_url = args.url_server html_output_filename = args.html_table 
@@ -5578,14 +5598,14 @@ def main_validate(args): output_loglevel = logging.DEBUG # check existence of directory - if not os.path.exists(study_dir): - print('directory cannot be found: ' + study_dir, file=sys.stderr) + if not os.path.exists(data_dir): + print('directory cannot be found: ' + data_dir, file=sys.stderr) return 2 # set default message handler text_handler = logging.StreamHandler(sys.stdout) text_handler.setFormatter( - cbioportal_common.LogfileStyleFormatter(study_dir)) + cbioportal_common.LogfileStyleFormatter(data_dir)) collapsing_text_handler = cbioportal_common.CollapsingLogMessageHandler( capacity=5e5, flushLevel=logging.CRITICAL, @@ -5601,7 +5621,7 @@ def main_validate(args): import jinja2 # pylint: disable=import-error html_handler = Jinja2HtmlHandler( - study_dir, + data_dir, html_output_filename, capacity=1e5) # TODO extend CollapsingLogMessageHandler to flush to multiple targets, @@ -5615,7 +5635,7 @@ def main_validate(args): if args.error_file: errfile_handler = logging.FileHandler(args.error_file, 'w') - errfile_handler.setFormatter(ErrorFileFormatter(study_dir)) + errfile_handler.setFormatter(ErrorFileFormatter(data_dir)) # TODO extend CollapsingLogMessageHandler to flush to multiple targets, # and get rid of the duplicated buffering of messages here coll_errfile_handler = cbioportal_common.CollapsingLogMessageHandler( @@ -5644,13 +5664,17 @@ def main_validate(args): # set portal version cbio_version = portal_instance.portal_version - validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + if partial_data: + validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) + else: + validate_study(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks) if html_handler is not None: # flush logger and generate HTML while overriding cbio_version after retrieving it from the API collapsing_html_handler.flush() html_handler.generateHtml(cbio_version=cbio_version) + 
logger.info('Validation complete') return exit_status_handler.get_exit_status() @@ -5670,7 +5694,7 @@ def _get_column_index(parts, name): finally: logging.shutdown() del logging._handlerList[:] # workaround for harmless exceptions on exit - print(('Validation of study {status}.'.format( + print(('Validation of data {status}.'.format( status={0: 'succeeded', 1: 'failed', 2: 'not performed as problems occurred', From f044c3b54937743104a6d282070e2c41bcc39285 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 4 Apr 2024 22:45:13 +0200 Subject: [PATCH 016/130] Fix java.sql.SQLException: Generated keys not requested --- src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index 09e05035..72d1c1b2 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -54,7 +54,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { con = JdbcUtil.getDbConnection(DaoSampleList.class); pstmt = con.prepareStatement("INSERT INTO sample_list (`STABLE_ID`, `CANCER_STUDY_ID`, `NAME`, `CATEGORY`," + - "`DESCRIPTION`)" + " VALUES (?,?,?,?,?)"); + "`DESCRIPTION`)" + " VALUES (?,?,?,?,?)", Statement.RETURN_GENERATED_KEYS); pstmt.setString(1, sampleList.getStableId()); pstmt.setInt(2, sampleList.getCancerStudyId()); pstmt.setString(3, sampleList.getName()); From 48fca0353bb080710590269a63b9047fe08a1a9b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 5 Apr 2024 12:37:55 +0200 Subject: [PATCH 017/130] Clean alteration_driver_annotation during mutations inc. 
upload --- src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index d2f80527..9fa36f68 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -1499,10 +1499,17 @@ public static void deleteAllRecordsInGeneticProfileForSample(long geneticProfile ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoMutation.class); + // TODO Move it to another class? + pstmt = con.prepareStatement("DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"); + pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, internalSampleId); + pstmt.executeUpdate(); + pstmt = con.prepareStatement("DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"); pstmt.setLong(1, geneticProfileId); pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); + // TODO Remove row in mutation_event if it does not have mutations left // TODO Remove profile if no mutations nor mutation_event(s) left } catch (SQLException e) { From 1302a8e453e31256783cc5b97d5eaf16b9944a02 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 5 Apr 2024 12:40:51 +0200 Subject: [PATCH 018/130] Fix validator and importer py scripts for inc. 
upload --- scripts/importer/cbioportalImporter.py | 9 ++++--- scripts/importer/validateData.py | 35 ++++++++++++++++++-------- 2 files changed, 30 insertions(+), 14 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index b6111b6d..8958c258 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -469,10 +469,11 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) # TODO it's to fragile to rely on the order of types like that. Too implicit for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: - meta_filename, meta_dictionary = meta_file_type_to_meta_files[meta_file_type] - data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) - - import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) + meta_pairs = meta_file_type_to_meta_files[meta_file_type] + for meta_pair in meta_pairs: + meta_filename, meta_dictionary = meta_pair + data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) + import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) # TODO we could also validate not supported types after loading is done? 
if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index b5f3f3f6..aebb3ea1 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -75,7 +75,7 @@ sample_ids_panel_dict = {} # resource globals -RESOURCE_DEFINITION_DICTIONARY = {} +RESOURCE_DEFINITION_DICTIONARY = None RESOURCE_PATIENTS_WITH_SAMPLES = None # globals required for gene set scoring validation @@ -3385,7 +3385,7 @@ def checkLine(self, data): sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1] # Sample ID has been removed from list, so subtract 1 position. if data[self.mutation_stable_id_index - 1] != 'NA': - if sample_id not in mutation_sample_ids: + if mutation_sample_ids is not None and sample_id not in mutation_sample_ids: self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list', extra={'line_number': self.line_number, 'cause': sample_id}) @@ -3790,7 +3790,8 @@ def checkLine(self, data): 'column_number': col_index + 1, 'cause': value}) # make sure that RESOURCE_ID is defined in the resource definition file - if value not in RESOURCE_DEFINITION_DICTIONARY: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and value not in RESOURCE_DEFINITION_DICTIONARY: self.logger.error( 'RESOURCE_ID is not defined in resource definition file', extra={'line_number': self.line_number, @@ -3857,13 +3858,17 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if value not in RESOURCE_DEFINITION_DICTIONARY or 'SAMPLE' not in RESOURCE_DEFINITION_DICTIONARY[value]: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and (value not in RESOURCE_DEFINITION_DICTIONARY \ + or 'SAMPLE' not in RESOURCE_DEFINITION_DICTIONARY[value]): self.logger.error( 'RESOURCE_ID for sample resource is not defined correctly in resource definition file', 
extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and value in RESOURCE_DEFINITION_DICTIONARY \ + and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for sample resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -3918,13 +3923,17 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if value not in RESOURCE_DEFINITION_DICTIONARY or 'PATIENT' not in RESOURCE_DEFINITION_DICTIONARY[value]: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and (value not in RESOURCE_DEFINITION_DICTIONARY \ + or 'PATIENT' not in RESOURCE_DEFINITION_DICTIONARY[value]): self.logger.error( 'RESOURCE_ID for patient resource is not defined correctly in resource definition file', extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and value in RESOURCE_DEFINITION_DICTIONARY \ + and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for patient resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -3968,13 +3977,17 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if value not in RESOURCE_DEFINITION_DICTIONARY or 'STUDY' not in RESOURCE_DEFINITION_DICTIONARY[value]: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and (value not in RESOURCE_DEFINITION_DICTIONARY \ + or 'STUDY' not in RESOURCE_DEFINITION_DICTIONARY[value]): self.logger.error( 'RESOURCE_ID for study resource is not defined 
correctly in resource definition file', extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if RESOURCE_DEFINITION_DICTIONARY is not None \ + and value in RESOURCE_DEFINITION_DICTIONARY \ + and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for study resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -5557,7 +5570,9 @@ def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_ma continue logger.info("Validating %s", meta_file_type) for validator in validators: - validator.validate() + # TODO skip None's. Why do we even have them? + if validator: + validator.validate() def get_pom_path(): From 659f352c16708767fd631496366a707e717a67fb Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 5 Apr 2024 15:22:01 +0200 Subject: [PATCH 019/130] Add test/demo data for incremental loading of study_es_0 study --- .../case_lists/cases_sequenced.txt | 5 +++++ .../study_es_0_inc/data_clinical_patients.txt | 7 +++++++ .../study_es_0_inc/data_clinical_samples.txt | 8 ++++++++ .../data_mutations_extended.maf | 20 +++++++++++++++++++ .../study_es_0_inc/meta_clinical_patients.txt | 4 ++++ .../study_es_0_inc/meta_clinical_samples.txt | 4 ++++ .../meta_mutations_extended.txt | 10 ++++++++++ 7 files changed, 58 insertions(+) create mode 100644 src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt create mode 100644 src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt create mode 100644 src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt create mode 100644 src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf create mode 100644 src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt create mode 100644 src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt 
create mode 100644 src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt b/src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt new file mode 100644 index 00000000..c05c77f7 --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt @@ -0,0 +1,5 @@ +cancer_study_identifier: study_es_0 +stable_id: study_es_0_sequenced +case_list_name: Samples profiled for mutations +case_list_description: This is this case list that contains all samples that are profiled for mutations. +case_list_ids: TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW-01 diff --git a/src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt b/src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt new file mode 100644 index 00000000..c2679ddf --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt @@ -0,0 +1,7 @@ +#Patient Identifier Overall Survival Status Overall Survival (Months) Disease Free Status Disease Free (Months) +#Patient identifier Overall survival status Overall survival in months since diagnosis Disease free status Disease free in months since treatment +#STRING STRING NUMBER STRING NUMBER +#1 1 1 1 1 +PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS +TCGA-BH-A18K 1:DECEASED 96.74 1:Recurred/Progressed 36 +TCGA-BH-NEW 0:LIVING 2.37 0:DiseaseFree 2.37 diff --git a/src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt b/src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt new file mode 100644 index 00000000..40dbf1ea --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt @@ -0,0 +1,8 @@ +#Patient Identifier Sample Identifier Subtype +#Patient identifier Sample identifier Subtype description +#STRING STRING STRING +#1 1 1 +PATIENT_ID SAMPLE_ID SUBTYPE +TCGA-A1-A0SB TCGA-A1-A0SB-01 Luminal A +TCGA-A1-A0SB TCGA-A1-A0SB-03 
basal-like +TCGA-BH-NEW TCGA-BH-NEW-01 NA diff --git a/src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf b/src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf new file mode 100644 index 00000000..aa50f1d2 --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf @@ -0,0 +1,20 @@ +#version 2.4 +Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer MA:FImpact MA:FIS Amino_Acid_Change MA:link.MSA MA:link.PDB MA:link.var Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons Existing_variation ALLELE_NUM DISTANCE SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS GMAF AFR_MAF AMR_MAF ASN_MAF EAS_MAF EUR_MAF SAS_MAF AA_MAF EA_MAF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL HGVS_OFFSET PHENO chromosome_name_wu start_wu stop_wu reference_wu variant_wu type_wu gene_name_wu transcript_name_wu transcript_species_wu transcript_source_wu transcript_version_wu strand_wu transcript_status_wu trv_type_wu c_position_wu amino_acid_change_wu ucsc_cons_wu domain_wu all_domains_wu deletion_substructures_wu transcript_error_wu default_gene_name_wu gene_name_source_wu ensembl_gene_id 
normal_ref_reads normal_var_reads normal_vaf tumor_ref_reads tumors_var_reads tumor_vaf evs_ea evs_aa evs_all chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU transcript_error_WU default_gene_name_WU gene_name_source_WU EVS_EA EVS_AA EVS_All cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation Zygosity.name Zygosity.code +OR11H1 genome.wustl.edu GRCh37 22 16449539 16449539 -1 Missense_Mutation SNP A A G TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.49 V89A getma.org/?cm=msa&ty=f&p=O11H1_HUMAN&rb=1&re=154&var=V89A getma.org/?cm=var&var=hg19,22,16449539,A,G&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.266T>C p.Val89Ala p.V89A ENST00000252835 1/1 0 0 OR11H1,missense_variant,p.Val89Ala,ENST00000252835,NM_001005239.1; G ENSG00000130538 ENST00000252835 Transcript missense_variant 267/982 266/981 89/326 V/A gTc/gCc rs199856986,COSM1484040 1 OR11H1 HGNC 15404 protein_coding YES CCDS33594.1 ENSP00000252835 O11H1_HUMAN UPI000004B1CF NM_001005239.1 deleterious(0.02) possibly_damaging(0.589) 1/1 Transmembrane_helices:TMhelix,PROSITE_profiles:PS50262,hmmpanther:PTHR24242:SF201,hmmpanther:PTHR24242,Gene3D:1.20.1070.10,Superfamily_domains:SSF81321 0,1 MODERATE 1 SNV 0,1 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC ENSG00000130538 65 0 0 38 6 13.64 - - - 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 
pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC - - - Putative_Driver Test driver +TMEM247 genome.wustl.edu GRCh37 2 46707888 46707888 1 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.463delG p.Ala155ArgfsTer59 p.A155Rfs*59 ENST00000434431 2/3 0 0 TMEM247,frameshift_variant,p.Ala155ArgfsTer59,ENST00000434431,NM_001145051.2;TMEM247,intron_variant,,ENST00000432241,; - ENSG00000187600 ENST00000434431 Transcript frameshift_variant 462/659 462/659 154/219 E/X gaG/ga COSM1408208,~rs70940616 1 TMEM247 HGNC 42967 protein_coding YES CCDS56117.1 ENSP00000388684 TM247_HUMAN UPI0000366EF8 NM_001145051.2 2/3 Coiled-coils_(Ncoils):Coil,Pfam_domain:PF15444 -:0.0202 -:0.0439 1 HIGH 1 deletion 1 1 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC ENSG00000187600 20 0 0 7 3 30 - - - 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC - - - Putative_Passenger Test passenger Class 2 Class annotation +ABLIM1 genome.wustl.edu GRCh37 10 116247760 116247760 -1 Missense_Mutation SNP T C C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 3.39 H333R getma.org/?cm=msa&ty=f&p=ABLM1_HUMAN&rb=285&re=339&var=H333R getma.org/pdb.php?prot=ABLM1_HUMAN&from=285&to=339&var=H333R getma.org/?cm=var&var=hg19,10,116247760,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.998A>G p.His333Arg p.H333R ENST00000277895 8/23 0 0 
ABLIM1,missense_variant,p.His273Arg,ENST00000533213,;ABLIM1,missense_variant,p.His273Arg,ENST00000369252,NM_001003408.1,NM_001003407.1;ABLIM1,missense_variant,p.His17Arg,ENST00000392952,NM_006720.3;ABLIM1,missense_variant,p.His17Arg,ENST00000369266,;ABLIM1,missense_variant,p.His333Arg,ENST00000277895,NM_002313.5;ABLIM1,missense_variant,p.His17Arg,ENST00000369253,;ABLIM1,missense_variant,p.His17Arg,ENST00000428430,;ABLIM1,upstream_gene_variant,,ENST00000440467,;ABLIM1,missense_variant,p.His273Arg,ENST00000392955,;ABLIM1,missense_variant,p.His273Arg,ENST00000369256,; C ENSG00000099204 ENST00000277895 Transcript missense_variant 1096/2657 998/2337 333/778 H/R cAt/cGt COSM1474374,COSM1474373,COSM1474375 1 ABLIM1 HGNC 78 protein_coding YES CCDS7590.1 ENSP00000277895 ABLM1_HUMAN UPI0000418D06 NM_002313.5 deleterious(0) probably_damaging(0.988) 8/23 PROSITE_profiles:PS50023,hmmpanther:PTHR24213:SF18,hmmpanther:PTHR24213,Gene3D:2.10.110.10,SMART_domains:SM00132,Superfamily_domains:SSF57716 1,1,1 MODERATE 1 SNV 1,1,1 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC ENSG00000099204 77 0 0 36 13 26.53 - - - 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC - - - Putative_Driver Test driver Class 1 Class annotation +ADAMTS20 genome.wustl.edu GRCh37 12 43944926 43944926 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2.85 Y80C getma.org/?cm=msa&ty=f&p=ATS20_HUMAN&rb=40&re=186&var=Y80C 
getma.org/?cm=var&var=hg19,12,43944926,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 p.Tyr80Cys p.Y80C ENST00000389420 2/39 0 0 ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000389420,NM_025003.3;ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000553158,; C ENSG00000173157 ENST00000389420 Transcript missense_variant 239/6076 239/5733 80/1910 Y/C tAt/tGt COSM1476552,COSM1476551 1 ADAMTS20 HGNC 17178 protein_coding YES CCDS31778.2 ENSP00000374071 ATS20_HUMAN UPI00004565F4 NM_025003.3 deleterious(0) probably_damaging(1) 2/39 hmmpanther:PTHR13723,hmmpanther:PTHR13723:SF165,Pfam_domain:PF01562 1,1 MODERATE 1 SNV 1,1 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC ENSG00000173157 50 0 0 19 17 45.95 - - - 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC - - - Unknown Class 4 Class annotation +DTNB genome.wustl.edu GRCh37 2 25678299 25678299 -1 Missense_Mutation SNP C G T TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 C C Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2125 V382M getma.org/?cm=msa&ty=f&p=DTNB_HUMAN&rb=283&re=473&var=V382M getma.org/?cm=var&var=hg19,2,25678299,C,T&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 
c.1144C>A p.Val382Met p.V382M ENST00000406818 11/21 0 0 DTNB,missense_variant,p.Val382Met,ENST00000406818,NM_001256303.1,NM_021907.4;DTNB,missense_variant,p.Val382Met,ENST00000407661,NM_183360.2,NM_001256304.1;DTNB,missense_variant,p.Val382Met,ENST00000404103,NM_033147.3;DTNB,missense_variant,p.Val382Met,ENST00000288642,;DTNB,missense_variant,p.Val325Met,ENST00000496972,NM_001256308.1;DTNB,missense_variant,p.Val178Met,ENST00000545439,;DTNB,intron_variant,,ENST00000407038,NM_033148.3;DTNB,intron_variant,,ENST00000407186,;DTNB,intron_variant,,ENST00000405222,NM_183361.2;DTNB,intron_variant,,ENST00000489756,;DTNB,intron_variant,,ENST00000481841,;DTNB,intron_variant,,ENST00000486555,;DTNB,3_prime_UTR_variant,,ENST00000398951,;DTNB,non_coding_transcript_exon_variant,,ENST00000485845,;DTNB,non_coding_transcript_exon_variant,,ENST00000479898,;DTNB,intron_variant,,ENST00000356599,;DTNB,intron_variant,,ENST00000482145,; T ENSG00000138101 ENST00000406818 Transcript missense_variant 1394/2474 1144/1884 382/627 V/M Gtg/Atg COSM3839175,COSM3839176 1 DTNB HGNC 3058 protein_coding YES CCDS46237.1 ENSP00000384084 DTNB_HUMAN Q53TC8_HUMAN,Q53T51_HUMAN,Q53SF9_HUMAN,Q53QV1_HUMAN,F8W9U0_HUMAN,E9PE76_HUMAN,E7ES64_HUMAN UPI0000129949 NM_001256303.1,NM_021907.4 deleterious(0.03) benign(0.379) 11/21 hmmpanther:PTHR11915:SF227,hmmpanther:PTHR11915,PIRSF_domain:PIRSF038204 1,1 MODERATE 1 SNV 1,1 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC ENSG00000138101 35 0 0 9 9 50 - - - 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC - - - Putative_Passenger Test passenger Class 1 Class annotation 
+TP53 genome.wustl.edu GRCh37 17 7578253 7578253 0 Missense_Mutation SNP C C A TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A C Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx medium 3.005 getma.org/?cm=msa&ty=f&p=P53_HUMAN&rb=95&re=289&var=G199V getma.org/pdb.php?prot=P53_HUMAN&from=95&to=289&var=G199V getma.org/?cm=var&var=hg19,17,7578253,C,A&fts=all ENST00000269305.4:c.596G>T p.Gly199Val p.G199V ENST00000269305 11-Jun 0 0 TP53,missense_variant,p.Gly199Val,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,missense_variant,p.Gly199Val,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,missense_variant,p.Gly199Val,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,missense_variant,p.Gly199Val,ENST00000445888,;TP53,missense_variant,p.Gly199Val,ENST00000359597,;TP53,missense_variant,p.Gly199Val,ENST00000413465,;TP53,missense_variant,p.Gly67Val,ENST00000509690,;TP53,missense_variant,p.Gly106Val,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,upstream_gene_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000574684,;TP53,non_coding_transcript_exon_variant,,ENST00000510385,;TP53,non_coding_transcript_exon_variant,,ENST00000504290,;TP53,non_coding_transcript_exon_variant,,ENST00000504937,;TP53,non_coding_transcript_exon_variant,,ENST00000505014,; A ENSG00000141510 ENST00000269305 Transcript missense_variant 786/2579 596/1182 199 G/V gGa/gTa TP53_g.12665G>T,COSM44140,COSM255788,COSM255787,COSM255789,COSM3675525,COSM3675524,COSM255790 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN 
S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 deleterious(0) probably_damaging(1) 11-Jun Gene3D:2.60.40.720,Pfam_domain:PF00870,hmmpanther:PTHR11447,hmmpanther:PTHR11447:SF6,Superfamily_domains:SSF49417 0,1,1,1,1,1,1,1 MODERATE 1 SNV 0,1,1,1,1,1,1,1 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - +TP53 genome.wustl.edu GRCh37 17 7576851 7576851 0 Splice_Site SNP A A C novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000269305.4:c.993+2T>G p.X331_splice ENST00000269305 0 0 
TP53,splice_donor_variant,,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,splice_donor_variant,,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,splice_donor_variant,,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,splice_donor_variant,,ENST00000445888,;TP53,splice_donor_variant,,ENST00000359597,;TP53,splice_donor_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000413465,;TP53,downstream_gene_variant,,ENST00000509690,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,downstream_gene_variant,,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000574684,;TP53,splice_donor_variant,,ENST00000510385,;TP53,splice_donor_variant,,ENST00000504290,;TP53,splice_donor_variant,,ENST00000504937,;TP53,downstream_gene_variant,,ENST00000505014,; C ENSG00000141510 ENST00000269305 Transcript splice_donor_variant -/2579 993/1182 TP53_g.14067T>G,COSM29774,COSM146229 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 10-Sep 0,1,1 HIGH 1 SNV 0,1,1 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - +BRCA1 genome.wustl.edu GRCh37 17 41243581 41243581 0 Nonsense_Mutation SNP G G A rs80357262 TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx 0 getma.org/?cm=var&var=hg19,17,41243581,G,A&fts=all ENST00000357654.3:c.3967C>T p.Gln1323Ter p.Q1323* ENST00000357654 23-Oct 0 
0 BRCA1,stop_gained,p.Gln1027Ter,ENST00000309486,NM_007297.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000357654,NM_007294.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000346315,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000354071,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000471181,NM_007300.3;BRCA1,stop_gained,p.Gln1276Ter,ENST00000493795,;BRCA1,stop_gained,p.Gln88Ter,ENST00000461574,;BRCA1,intron_variant,,ENST00000352993,;BRCA1,intron_variant,,ENST00000351666,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,intron_variant,,ENST00000491747,NM_007298.3;BRCA1,intron_variant,,ENST00000478531,;BRCA1,intron_variant,,ENST00000493919,;BRCA1,intron_variant,,ENST00000484087,;BRCA1,intron_variant,,ENST00000591534,;BRCA1,intron_variant,,ENST00000487825,;BRCA1,intron_variant,,ENST00000586385,;BRCA1,intron_variant,,ENST00000591849,;BRCA1,downstream_gene_variant,,ENST00000470026,;BRCA1,downstream_gene_variant,,ENST00000477152,;BRCA1,downstream_gene_variant,,ENST00000494123,;BRCA1,downstream_gene_variant,,ENST00000473961,;BRCA1,downstream_gene_variant,,ENST00000497488,;BRCA1,downstream_gene_variant,,ENST00000476777,;BRCA1,3_prime_UTR_variant,,ENST00000461221,;BRCA1,non_coding_transcript_exon_variant,,ENST00000467274,;BRCA1,downstream_gene_variant,,ENST00000492859,;BRCA1,downstream_gene_variant,,ENST00000412061,; A ENSG00000012048 ENST00000357654 Transcript stop_gained 4086/7094 3967/5592 1323 Q/* Caa/Taa rs80357262 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 23-Oct PIRSF_domain:PIRSF001734,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0 not_provided,pathogenic HIGH SNV 1 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 
human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - +BRCA1 genome.wustl.edu GRCh37 17 41201181 41201181 0 Missense_Mutation SNP C C A rs80357069 byCluster TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 C C Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx medium 2.25 getma.org/?cm=msa&ty=f&p=BRCA1_HUMAN&rb=1756&re=1842&var=G1788V getma.org/pdb.php?prot=BRCA1_HUMAN&from=1756&to=1842&var=G1788V getma.org/?cm=var&var=hg19,17,41201181,C,A&fts=all ENST00000357654.3:c.5363G>T p.Gly1788Val p.G1788V ENST00000357654 21/23 0 0 BRCA1,missense_variant,p.Gly1492Val,ENST00000309486,NM_007297.3;BRCA1,missense_variant,p.Gly1788Val,ENST00000357654,NM_007294.3;BRCA1,missense_variant,p.Gly1549Val,ENST00000346315,;BRCA1,missense_variant,p.Gly1523Val,ENST00000354071,;BRCA1,missense_variant,p.Gly1809Val,ENST00000471181,NM_007300.3;BRCA1,missense_variant,p.Gly1741Val,ENST00000493795,;BRCA1,missense_variant,p.Gly646Val,ENST00000352993,;BRCA1,missense_variant,p.Gly605Val,ENST00000351666,;BRCA1,missense_variant,p.Gly684Val,ENST00000491747,NM_007298.3;BRCA1,missense_variant,p.Gly279Val,ENST00000591534,;BRCA1,missense_variant,p.Gly98Val,ENST00000586385,;BRCA1,missense_variant,p.Gly21Val,ENST00000591849,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,3_prime_UTR_variant,,ENST00000461221,; A ENSG00000012048 ENST00000357654 Transcript missense_variant 5482/7094 5363/5592 1788 G/V gGt/gTt rs80357069,COSM436662 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN 
Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 deleterious(0) benign(0.031) 21/23 Gene3D:3.40.50.10190,Pfam_domain:PF00533,PIRSF_domain:PIRSF001734,Prints_domain:PR00493,PROSITE_profiles:PS50172,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0,SMART_domains:SM00292,Superfamily_domains:SSF52113 not_provided,pathogenic 0,1 MODERATE SNV 1,1 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - +ATM genome.wustl.edu GRCh37 11 108173702 108173702 0 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 - G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx ENST00000278616.4:c.5443delG p.Asp1815ThrfsTer13 p.D1815Tfs*13 ENST00000278616 36/63 0 0 ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000452508,;ATM,non_coding_transcript_exon_variant,,ENST00000524792,;ATM,non_coding_transcript_exon_variant,,ENST00000533690,;ATM,non_coding_transcript_exon_variant,,ENST00000534625,;ATM,upstream_gene_variant,,ENST00000529588,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 5827/13147 5442/9171 1814 L/X ttG/tt rs772138812 1 ATM HGNC 795 protein_coding YES CCDS31669.1 
ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 36/63 hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 HIGH 1 deletion 1 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - +ATM genome.wustl.edu GRCh37 11 108106472 108106472 0 Frame_Shift_Del DEL T T - novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000278616.4:c.409delT p.Tyr137ThrfsTer16 p.Y137Tfs*16 ENST00000278616 May-63 0 0 ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000452508,;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000527805,;ATM,intron_variant,,ENST00000527891,;ATM,downstream_gene_variant,,ENST00000601453,;ATM,non_coding_transcript_exon_variant,,ENST00000530958,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 792/13147 407/9171 136 I/X aTt/at COSM428356,COSM1474979 1 ATM HGNC 795 protein_coding YES CCDS31669.1 ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 May-63 Pfam_domain:PF11640,hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 1,1 HIGH 1 deletion 2 1,1 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like 
(PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like (PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) +KAT2A genome.wustl.edu GRCh37 17 40272381 40272381 -1 Silent SNP G G A TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.471C>T p.= p.H157H ENST00000225916 3/18 0 0 KAT2A,synonymous_variant,p.=,ENST00000225916,NM_021078.2;CTD-2132N18.3,synonymous_variant,p.=,ENST00000592574,;RAB5C,downstream_gene_variant,,ENST00000393860,NM_201434.2;RAB5C,downstream_gene_variant,,ENST00000346213,NM_004583.3;HSPB9,upstream_gene_variant,,ENST00000355067,NM_033194.2;CTD-2132N18.3,missense_variant,p.Thr150Met,ENST00000592248,;KAT2A,synonymous_variant,p.=,ENST00000465682,;CTD-2132N18.3,3_prime_UTR_variant,,ENST00000585562,;KAT2A,upstream_gene_variant,,ENST00000592310,;KAT2A,upstream_gene_variant,,ENST00000588759,; A ENSG00000108773 ENST00000225916 Transcript synonymous_variant 525/3109 471/2514 157/837 H caC/caT rs536716483,COSM1479581 1 KAT2A HGNC 4201 protein_coding YES CCDS11417.1 ENSP00000225916 KAT2A_HUMAN K7ERS6_HUMAN UPI000000D978 NM_021078.2 3/18 hmmpanther:PTHR22880:SF124,hmmpanther:PTHR22880,Pfam_domain:PF06466,PIRSF_domain:PIRSF003048 A:0.0002 A:0 A:0 A:0.001 A:0 A:0 0,1 LOW 1 SNV 0,1 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho 
pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene ENSG00000267261 40 0 0 30 36 54.55 - - - 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene - - - Putative_Driver Test driver Class 1 Class annotation +MSH3 genome.wustl.edu GRCh37 5 80024722 80024722 1 Frame_Shift_Del DEL T T - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.1508delT p.Leu503TrpfsTer5 p.L503Wfs*5 ENST00000265081 10/24 0 0 MSH3,frameshift_variant,p.Leu503TrpfsTer5,ENST00000265081,NM_002439.4;MSH3,non_coding_transcript_exon_variant,,ENST00000512258,; - ENSG00000113318 ENST00000265081 Transcript frameshift_variant 1586/4092 1506/3414 502/1137 S/X tcT/tc 1 MSH3 HGNC 7326 protein_coding YES CCDS34195.1 ENSP00000265081 MSH3_HUMAN UPI0000DBEE85 NM_002439.4 10/24 Superfamily_domains:SSF53150,Gene3D:3.30.420.110,Pfam_domain:PF05188,hmmpanther:PTHR11361,hmmpanther:PTHR11361:SF34 HIGH 1 deletion 2 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt 
pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC ENSG00000113318 83 0 0 12 2 14.29 - - - 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation +MYB genome.wustl.edu GRCh37 6 135507043 135507044 1 Frame_Shift_Ins INS - - A TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 - - Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.27dupA p.Tyr10IlefsTer2 p.Y10Ifs*2 ENST00000367814 2/15 0 0 
MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000341911,NM_001130173.1,NM_001161658.1,NM_001161656.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000316528,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000442647,NM_001161660.1,NM_001130172.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367814,NM_001161659.1,NM_005375.2;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525369,NM_001161657.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000527615,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528774,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534121,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533624,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534044,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000420123,;MYB,upstream_gene_variant,,ENST00000430686,;MYB,non_coding_transcript_exon_variant,,ENST00000531845,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367812,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533837,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000438901,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525477,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000463282,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000339290,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533808,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525514,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529586,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526889,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526320,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531519,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533384,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531737,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529262,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526565,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528015,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526187,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525002,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528343,;MYB,frameshift_vari
ant,p.Tyr10IlefsTer2,ENST00000528140,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528345,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525940,;MYB,frameshift_variant,p.Tyr10Ter,ENST00000531634,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000524588,; A ENSG00000118513 ENST00000367814 Transcript frameshift_variant 212-213/3302 26-27/1923 9/640 I/IX ata/atAa COSM1487247,COSM1487248 1 MYB HGNC 7545 protein_coding CCDS5174.1 ENSP00000356788 MYB_HUMAN Q9UMI7_HUMAN,Q708J0_HUMAN,Q708E9_HUMAN,Q708E3_HUMAN UPI000012FAEA NM_001161659.1,NM_005375.2 2/15 hmmpanther:PTHR10641,hmmpanther:PTHR10641:SF454 1,1 HIGH insertion 1 1,1 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC ENSG00000118513 50 0 0 36 4 10 - - - 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC - - - Putative_Passenger Test passenger +PIEZO1 genome.wustl.edu GRCh37 16 88790292 88790292 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.18 Q1441R getma.org/?cm=msa&ty=f&p=PIEZ1_HUMAN&rb=58&re=1627&var=Q1441R getma.org/?cm=var&var=hg19,16,88790292,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.4322A>G p.Gln1441Arg p.Q1441R ENST00000301015 31/51 0 0 
PIEZO1,missense_variant,p.Gln1441Arg,ENST00000301015,NM_001142864.2;PIEZO1,missense_variant,p.Gln115Arg,ENST00000474606,;PIEZO1,upstream_gene_variant,,ENST00000327397,;PIEZO1,upstream_gene_variant,,ENST00000466823,;RP5-1142A6.9,downstream_gene_variant,,ENST00000564984,;PIEZO1,non_coding_transcript_exon_variant,,ENST00000566414,;PIEZO1,upstream_gene_variant,,ENST00000419505,;PIEZO1,upstream_gene_variant,,ENST00000497793,;PIEZO1,upstream_gene_variant,,ENST00000495568,;PIEZO1,downstream_gene_variant,,ENST00000475586,;PIEZO1,downstream_gene_variant,,ENST00000491917,; C ENSG00000103335 ENST00000301015 Transcript missense_variant 4569/8072 4322/7566 1441/2521 Q/R cAg/cGg COSM1479166 1 PIEZO1 HGNC 28993 protein_coding YES CCDS54058.1 ENSP00000301015 PIEZ1_HUMAN UPI0001B300F3 NM_001142864.2 tolerated(0.25) possibly_damaging(0.78) 31/51 hmmpanther:PTHR13167,hmmpanther:PTHR13167:SF40 1 MODERATE 1 SNV 1 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC ENSG00000103335 37 0 0 20 8 28.57 - - - 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation +BRCA2 genome.wustl.edu GRCh37 13 108106473 108106473 0 Nonsense_Mutation SNP T T C TCGA-BH-NEW-01 TCGA-BH-NEW-10 Germline p.D191G BRCA2_HUMAN +BRCA2 genome.wustl.edu GRCh37 13 108106474 108106474 0 Nonsense_Mutation SNP T T C TCGA-BH-NEW-01 TCGA-BH-NEW-10 Somatic p.D191G BRCA2_HUMAN +BRCA2 genome.wustl.edu GRCh37 13 108106475 108106475 0 In_Frame_Del DEL T T - TCGA-A1-A0SK-01 TCGA-A1-A0SK-10 Germline p.R2659T BRCA2_HUMAN diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt b/src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt new file mode 100644 index 00000000..5ff93a44 --- /dev/null +++ 
b/src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: PATIENT_ATTRIBUTES +data_filename: data_clinical_patients.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt b/src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt new file mode 100644 index 00000000..7e4f6741 --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: SAMPLE_ATTRIBUTES +data_filename: data_clinical_samples.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt b/src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt new file mode 100644 index 00000000..94df92aa --- /dev/null +++ b/src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: MUTATION_EXTENDED +datatype: MAF +stable_id: mutations +show_profile_in_analysis_tab: true +profile_description: Mutation data from whole exome sequencing. 
+profile_name: Mutations +data_filename: data_mutations_extended.maf +swissprot_identifier: name +namespaces: Zygosity From b5952e3df8387a181aa237db8f05349dda51f600 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Mon, 8 Apr 2024 21:30:02 +0200 Subject: [PATCH 020/130] Rename and move incremental tests to incementalTest folder --- .../incremental/TestIncrementalMutationsImport.java} | 4 ++-- .../incremental/TestIncrementalPatientsImport.java} | 5 ++--- .../incremental/TestIncrementalSamplesImport.java} | 4 ++-- .../incremental/TestUpdateCaseListsSampleIds.java} | 4 ++-- 4 files changed, 8 insertions(+), 9 deletions(-) rename src/test/java/org/mskcc/cbio/portal/{scripts/incremental/ITIncrementalMutationsImport.java => integrationTest/incremental/TestIncrementalMutationsImport.java} (98%) rename src/test/java/org/mskcc/cbio/portal/{scripts/incremental/ITIncrementalPatientsImport.java => integrationTest/incremental/TestIncrementalPatientsImport.java} (97%) rename src/test/java/org/mskcc/cbio/portal/{scripts/incremental/ITIncrementalSamplesImport.java => integrationTest/incremental/TestIncrementalSamplesImport.java} (98%) rename src/test/java/org/mskcc/cbio/portal/{scripts/incremental/ITUpdateCaseListsSampleIds.java => integrationTest/incremental/TestUpdateCaseListsSampleIds.java} (98%) diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java similarity index 98% rename from src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java rename to src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java index bf1f8be0..7293d57a 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalMutationsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java @@ -30,7 +30,7 
@@ * along with this program. If not, see . */ -package org.mskcc.cbio.portal.scripts.incremental; +package org.mskcc.cbio.portal.integrationTest.incremental; import org.junit.Before; import org.junit.Test; @@ -62,7 +62,7 @@ @ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) @Rollback @Transactional -public class ITIncrementalMutationsImport { +public class TestIncrementalMutationsImport { public static final String STUDY_ID = "study_tcga_pub"; private CancerStudy cancerStudy; diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java similarity index 97% rename from src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java rename to src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java index 4ca89e21..d1b1b1ca 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalPatientsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java @@ -30,7 +30,7 @@ * along with this program. If not, see . 
*/ -package org.mskcc.cbio.portal.scripts.incremental; +package org.mskcc.cbio.portal.integrationTest.incremental; import org.junit.Before; import org.junit.Test; @@ -44,7 +44,6 @@ import org.springframework.transaction.annotation.Transactional; import java.io.File; -import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.stream.Collectors; @@ -62,7 +61,7 @@ @ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) @Rollback @Transactional -public class ITIncrementalPatientsImport { +public class TestIncrementalPatientsImport { public static final String STUDY_ID = "study_tcga_pub"; private CancerStudy cancerStudy; diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java similarity index 98% rename from src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java rename to src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java index 45ba8e1b..de9bd451 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITIncrementalSamplesImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java @@ -30,7 +30,7 @@ * along with this program. If not, see . 
*/ -package org.mskcc.cbio.portal.scripts.incremental; +package org.mskcc.cbio.portal.integrationTest.incremental; import org.junit.Before; import org.junit.Test; @@ -61,7 +61,7 @@ @ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) @Rollback @Transactional -public class ITIncrementalSamplesImport { +public class TestIncrementalSamplesImport { public static final String STUDY_ID = "study_tcga_pub"; private CancerStudy cancerStudy; diff --git a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java similarity index 98% rename from src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java rename to src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java index a98663b4..21432d1b 100644 --- a/src/test/java/org/mskcc/cbio/portal/scripts/incremental/ITUpdateCaseListsSampleIds.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java @@ -30,7 +30,7 @@ * along with this program. If not, see . 
*/ -package org.mskcc.cbio.portal.scripts.incremental; +package org.mskcc.cbio.portal.integrationTest.incremental; import org.junit.Before; import org.junit.Test; @@ -57,7 +57,7 @@ @ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) @Rollback @Transactional -public class ITUpdateCaseListsSampleIds { +public class TestUpdateCaseListsSampleIds { DaoSampleList daoSampleList = new DaoSampleList(); /** From 753119bc352fb31261915d4698d62404c38b6b82 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 9 Apr 2024 12:18:26 +0200 Subject: [PATCH 021/130] Update TODO comment how to deal with multiple sample files --- scripts/importer/cbioportalImporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index 8958c258..d1fdbb8a 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -477,7 +477,7 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity # TODO we could also validate not supported types after loading is done? if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: - # TODO Make meta attribute optional in command to update case lists + # TODO What if we have multiple clinical sample files? 
Throw exception or upload meta_filename, meta_dictionary = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES][0] case_list_dirname = os.path.join(data_directory, 'case_lists') update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) From 5725d427f48353ecaad5bc52640c9c3b1dd4735d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 9 Apr 2024 20:41:34 +0200 Subject: [PATCH 022/130] Move study_es_0_inc to the new test data folder --- .../test_data/study_es_0_inc/case_lists/cases_sequenced.txt | 0 .../test_data/study_es_0_inc/data_clinical_patients.txt | 0 .../test_data/study_es_0_inc/data_clinical_samples.txt | 0 .../test_data/study_es_0_inc/data_mutations_extended.maf | 0 .../test_data/study_es_0_inc/meta_clinical_patients.txt | 0 .../test_data/study_es_0_inc/meta_clinical_samples.txt | 0 .../test_data/study_es_0_inc/meta_mutations_extended.txt | 0 7 files changed, 0 insertions(+), 0 deletions(-) rename {src/test/scripts => tests}/test_data/study_es_0_inc/case_lists/cases_sequenced.txt (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/data_clinical_patients.txt (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/data_clinical_samples.txt (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/data_mutations_extended.maf (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/meta_clinical_patients.txt (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/meta_clinical_samples.txt (100%) rename {src/test/scripts => tests}/test_data/study_es_0_inc/meta_mutations_extended.txt (100%) diff --git a/src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt b/tests/test_data/study_es_0_inc/case_lists/cases_sequenced.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/case_lists/cases_sequenced.txt rename to tests/test_data/study_es_0_inc/case_lists/cases_sequenced.txt diff 
--git a/src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt b/tests/test_data/study_es_0_inc/data_clinical_patients.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/data_clinical_patients.txt rename to tests/test_data/study_es_0_inc/data_clinical_patients.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt b/tests/test_data/study_es_0_inc/data_clinical_samples.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/data_clinical_samples.txt rename to tests/test_data/study_es_0_inc/data_clinical_samples.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf b/tests/test_data/study_es_0_inc/data_mutations_extended.maf similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/data_mutations_extended.maf rename to tests/test_data/study_es_0_inc/data_mutations_extended.maf diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt b/tests/test_data/study_es_0_inc/meta_clinical_patients.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/meta_clinical_patients.txt rename to tests/test_data/study_es_0_inc/meta_clinical_patients.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt b/tests/test_data/study_es_0_inc/meta_clinical_samples.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/meta_clinical_samples.txt rename to tests/test_data/study_es_0_inc/meta_clinical_samples.txt diff --git a/src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt similarity index 100% rename from src/test/scripts/test_data/study_es_0_inc/meta_mutations_extended.txt rename to tests/test_data/study_es_0_inc/meta_mutations_extended.txt From 299466a4ed1b7ef1dbe283bc702011fc77a459c3 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 9 Apr 2024 23:29:37 
+0200 Subject: [PATCH 023/130] Fix removing patient attributes on samples inc. upload --- .../cbio/portal/scripts/ImportClinicalData.java | 4 ++-- .../TestIncrementalPatientsImport.java | 8 ++++---- .../incremental/TestIncrementalSamplesImport.java | 15 ++++++++++++--- 3 files changed, 18 insertions(+), 9 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index a57bc0bb..37d79125 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -336,7 +336,7 @@ private boolean addDatum(String[] fields, List columnAttrs, M Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), stableSampleId, false); if (sample != null) { internalSampleId = sample.getInternalId(); - if (overwriteExisting) { + if (overwriteExisting && this.attributesType == AttributeTypes.SAMPLE_ATTRIBUTES) { DaoClinicalData.removeSampleData(internalSampleId); } else { //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) @@ -358,7 +358,7 @@ private boolean addDatum(String[] fields, List columnAttrs, M if (patient != null) { //patient exists, get internal id: internalPatientId = patient.getInternalId(); - if (overwriteExisting) { + if (overwriteExisting && this.attributesType == AttributeTypes.PATIENT_ATTRIBUTES) { DaoClinicalData.removePatientData(internalPatientId); } } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java index d1b1b1ca..f5e1884e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java @@ -89,12 +89,12 @@ public void testInsertNewPatient() throws DaoException { assertNotNull("Patient with id " + newPatientId + " has to be injected to the DB.", newPatient); List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(newPatientId)); - Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + Map patientAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); assertEquals(Map.of( "SUBTYPE", "basal-like", "OS_STATUS", "0:LIVING", "OS_MONTHS", "45.6", - "DFS_STATUS", "1:Recurred/Progressed"), sampleAttrs); + "DFS_STATUS", "1:Recurred/Progressed"), patientAttrs); } @Test @@ -122,11 +122,11 @@ public void testUpdatePatientAttributes() throws DaoException { assertNotNull("Patient with id " + updatedPatientId + " has to be injected to the DB.", newPatient); List clinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(updatedPatientId)); - Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + Map patientAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); assertEquals(Map.of( "SUBTYPE", "basal-like", "OS_MONTHS", "56.7", "DFS_STATUS", "1:Recurred/Progressed", - "DFS_MONTHS", "100"), sampleAttrs); + "DFS_MONTHS", "100"), patientAttrs); } } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java index de9bd451..f6dfd0a3 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java @@ 
-79,8 +79,10 @@ public void testInsertNewSampleForExistingPatient() throws DaoException { /** * prepare a new patient without samples */ - Patient patient = new Patient(cancerStudy, "TEST-INC-TCGA-P1"); + String patientId = "TEST-INC-TCGA-P1"; + Patient patient = new Patient(cancerStudy, patientId); int internalPatientId = DaoPatient.addPatient(patient); + DaoClinicalData.addPatientDatum(internalPatientId, "OS_STATUS", "0:LIVING"); String newSampleId = "TEST-INC-TCGA-P1-S1"; File singleTcgaSampleFolder = new File("src/test/resources/incremental/insert_single_tcga_sample/"); @@ -99,13 +101,20 @@ public void testInsertNewSampleForExistingPatient() throws DaoException { Sample sample = samples.get(0); assertEquals(newSampleId, sample.getStableId()); - List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); - Map sampleAttrs = clinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + List sampleClinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), List.of(newSampleId)); + Map sampleAttrs = sampleClinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); assertEquals(Map.of( "SUBTYPE", "basal-like", "OS_STATUS", "1:DECEASED", "OS_MONTHS", "12.34", "DFS_STATUS", "1:Recurred/Progressed"), sampleAttrs); + + // Patient attributes get SAMPLE_COUNT + List patientClinicalData = DaoClinicalData.getData(cancerStudy.getInternalId(), List.of(patientId)); + Map patientAttrs = patientClinicalData.stream().collect(Collectors.toMap(ClinicalData::getAttrId, ClinicalData::getAttrVal)); + assertEquals(Map.of( + "OS_STATUS", "0:LIVING", + "SAMPLE_COUNT", "1"), patientAttrs); } /** From c0c28e2b764edf1c60a0b35f6d8481faeff2bb24 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 13:41:53 +0200 Subject: [PATCH 024/130] Change study_es_0_inc to contain more diverse data We changed them to work for the demo. 
Mutation numbers did not change on demo. --- .../study_es_0_inc/data_clinical_patients.txt | 2 +- .../study_es_0_inc/data_mutations_extended.maf | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/test_data/study_es_0_inc/data_clinical_patients.txt b/tests/test_data/study_es_0_inc/data_clinical_patients.txt index c2679ddf..105e2fa4 100644 --- a/tests/test_data/study_es_0_inc/data_clinical_patients.txt +++ b/tests/test_data/study_es_0_inc/data_clinical_patients.txt @@ -3,5 +3,5 @@ #STRING STRING NUMBER STRING NUMBER #1 1 1 1 1 PATIENT_ID OS_STATUS OS_MONTHS DFS_STATUS DFS_MONTHS -TCGA-BH-A18K 1:DECEASED 96.74 1:Recurred/Progressed 36 +TCGA-BH-A18K 1:DECEASED 96.74 NA [Not Available] TCGA-BH-NEW 0:LIVING 2.37 0:DiseaseFree 2.37 diff --git a/tests/test_data/study_es_0_inc/data_mutations_extended.maf b/tests/test_data/study_es_0_inc/data_mutations_extended.maf index aa50f1d2..42bd026a 100644 --- a/tests/test_data/study_es_0_inc/data_mutations_extended.maf +++ b/tests/test_data/study_es_0_inc/data_mutations_extended.maf @@ -1,17 +1,17 @@ #version 2.4 Hugo_Symbol Entrez_Gene_Id Center NCBI_Build Chromosome Start_Position End_Position Strand Variant_Classification Variant_Type Reference_Allele Tumor_Seq_Allele1 Tumor_Seq_Allele2 dbSNP_RS dbSNP_Val_Status Tumor_Sample_Barcode Matched_Norm_Sample_Barcode Match_Norm_Seq_Allele1 Match_Norm_Seq_Allele2 Tumor_Validation_Allele1 Tumor_Validation_Allele2 Match_Norm_Validation_Allele1 Match_Norm_Validation_Allele2 Verification_Status Validation_Status Mutation_Status Sequencing_Phase Sequence_Source Validation_Method Score BAM_File Sequencer MA:FImpact MA:FIS Amino_Acid_Change MA:link.MSA MA:link.PDB MA:link.var Tumor_Sample_UUID Matched_Norm_Sample_UUID HGVSc HGVSp HGVSp_Short Transcript_ID Exon_Number t_depth t_ref_count t_alt_count n_depth n_ref_count n_alt_count all_effects Allele Gene Feature Feature_type Consequence cDNA_position CDS_position Protein_position Amino_acids Codons 
Existing_variation ALLELE_NUM DISTANCE SYMBOL SYMBOL_SOURCE HGNC_ID BIOTYPE CANONICAL CCDS ENSP SWISSPROT TREMBL UNIPARC RefSeq SIFT PolyPhen EXON INTRON DOMAINS GMAF AFR_MAF AMR_MAF ASN_MAF EAS_MAF EUR_MAF SAS_MAF AA_MAF EA_MAF CLIN_SIG SOMATIC PUBMED MOTIF_NAME MOTIF_POS HIGH_INF_POS MOTIF_SCORE_CHANGE IMPACT PICK VARIANT_CLASS TSL HGVS_OFFSET PHENO chromosome_name_wu start_wu stop_wu reference_wu variant_wu type_wu gene_name_wu transcript_name_wu transcript_species_wu transcript_source_wu transcript_version_wu strand_wu transcript_status_wu trv_type_wu c_position_wu amino_acid_change_wu ucsc_cons_wu domain_wu all_domains_wu deletion_substructures_wu transcript_error_wu default_gene_name_wu gene_name_source_wu ensembl_gene_id normal_ref_reads normal_var_reads normal_vaf tumor_ref_reads tumors_var_reads tumor_vaf evs_ea evs_aa evs_all chromosome_name_WU start_WU stop_WU reference_WU variant_WU type_WU gene_name_WU transcript_name_WU transcript_species_WU transcript_source_WU transcript_version_WU strand_WU transcript_status_WU trv_type_WU c_position_WU amino_acid_change_WU ucsc_cons_WU domain_WU all_domains_WU deletion_substructures_WU transcript_error_WU default_gene_name_WU gene_name_source_WU EVS_EA EVS_AA EVS_All cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation Zygosity.name Zygosity.code OR11H1 genome.wustl.edu GRCh37 22 16449539 16449539 -1 Missense_Mutation SNP A A G TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.49 V89A getma.org/?cm=msa&ty=f&p=O11H1_HUMAN&rb=1&re=154&var=V89A getma.org/?cm=var&var=hg19,22,16449539,A,G&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.266T>C p.Val89Ala p.V89A ENST00000252835 1/1 0 0 OR11H1,missense_variant,p.Val89Ala,ENST00000252835,NM_001005239.1; G ENSG00000130538 ENST00000252835 Transcript missense_variant 267/982 266/981 89/326 V/A gTc/gCc rs199856986,COSM1484040 1 OR11H1 HGNC 15404 protein_coding 
YES CCDS33594.1 ENSP00000252835 O11H1_HUMAN UPI000004B1CF NM_001005239.1 deleterious(0.02) possibly_damaging(0.589) 1/1 Transmembrane_helices:TMhelix,PROSITE_profiles:PS50262,hmmpanther:PTHR24242:SF201,hmmpanther:PTHR24242,Gene3D:1.20.1070.10,Superfamily_domains:SSF81321 0,1 MODERATE 1 SNV 0,1 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC ENSG00000130538 65 0 0 38 6 13.64 - - - 22 16449539 16449539 A G SNP OR11H1 ENST00000252835 human ensembl 69_37n -1 known missense c.266 p.V89A 234 pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam pfam_7TM_GPCR_Rhodpsn,pfscan_GPCR_Rhodpsn_supfam,prints_Olfact_rcpt,prints_7TM_GPCR_Rhodpsn - no_errors OR11H1 HGNC - - - Putative_Driver Test driver -TMEM247 genome.wustl.edu GRCh37 2 46707888 46707888 1 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.463delG p.Ala155ArgfsTer59 p.A155Rfs*59 ENST00000434431 2/3 0 0 TMEM247,frameshift_variant,p.Ala155ArgfsTer59,ENST00000434431,NM_001145051.2;TMEM247,intron_variant,,ENST00000432241,; - ENSG00000187600 ENST00000434431 Transcript frameshift_variant 462/659 462/659 154/219 E/X gaG/ga COSM1408208,~rs70940616 1 TMEM247 HGNC 42967 protein_coding YES CCDS56117.1 ENSP00000388684 TM247_HUMAN UPI0000366EF8 NM_001145051.2 2/3 Coiled-coils_(Ncoils):Coil,Pfam_domain:PF15444 -:0.0202 -:0.0439 1 HIGH 1 deletion 1 1 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC ENSG00000187600 20 0 0 7 3 30 - - - 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 
p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC - - - Putative_Passenger Test passenger Class 2 Class annotation +TMEM247 genome.wustl.edu GRCh37 2 46707888 46707888 1 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.463delG p.Ala155ArgfsTer59 p.A155Rfs*59 ENST00000434431 2/3 0 0 TMEM247,frameshift_variant,p.Ala155ArgfsTer59,ENST00000434431,NM_001145051.2;TMEM247,intron_variant,,ENST00000432241,; - ENSG00000187600 ENST00000434431 Transcript frameshift_variant 462/659 462/659 154/219 E/X gaG/ga COSM1408208,~rs70940616 1 TMEM247 HGNC 42967 protein_coding YES CCDS56117.1 ENSP00000388684 TM247_HUMAN UPI0000366EF8 NM_001145051.2 2/3 Coiled-coils_(Ncoils):Coil,Pfam_domain:PF15444 -:0.0202 -:0.0439 1 HIGH 1 deletion 1 1 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC ENSG00000187600 20 0 0 7 3 30 - - - 2 46707888 46707888 G - DEL TMEM247 ENST00000434431 human ensembl 69_37n 1 known frame_shift_del c.462 p.A155fs 83 - no_stop_codon:bad_bp_length_for_coding_region TMEM247 HGNC - - - Putative_Passenger Test passenger Class 2 Class annotation ABLIM1 genome.wustl.edu GRCh37 10 116247760 116247760 -1 Missense_Mutation SNP T C C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 3.39 H333R getma.org/?cm=msa&ty=f&p=ABLM1_HUMAN&rb=285&re=339&var=H333R getma.org/pdb.php?prot=ABLM1_HUMAN&from=285&to=339&var=H333R getma.org/?cm=var&var=hg19,10,116247760,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.998A>G p.His333Arg p.H333R ENST00000277895 8/23 0 0 
ABLIM1,missense_variant,p.His273Arg,ENST00000533213,;ABLIM1,missense_variant,p.His273Arg,ENST00000369252,NM_001003408.1,NM_001003407.1;ABLIM1,missense_variant,p.His17Arg,ENST00000392952,NM_006720.3;ABLIM1,missense_variant,p.His17Arg,ENST00000369266,;ABLIM1,missense_variant,p.His333Arg,ENST00000277895,NM_002313.5;ABLIM1,missense_variant,p.His17Arg,ENST00000369253,;ABLIM1,missense_variant,p.His17Arg,ENST00000428430,;ABLIM1,upstream_gene_variant,,ENST00000440467,;ABLIM1,missense_variant,p.His273Arg,ENST00000392955,;ABLIM1,missense_variant,p.His273Arg,ENST00000369256,; C ENSG00000099204 ENST00000277895 Transcript missense_variant 1096/2657 998/2337 333/778 H/R cAt/cGt COSM1474374,COSM1474373,COSM1474375 1 ABLIM1 HGNC 78 protein_coding YES CCDS7590.1 ENSP00000277895 ABLM1_HUMAN UPI0000418D06 NM_002313.5 deleterious(0) probably_damaging(0.988) 8/23 PROSITE_profiles:PS50023,hmmpanther:PTHR24213:SF18,hmmpanther:PTHR24213,Gene3D:2.10.110.10,SMART_domains:SM00132,Superfamily_domains:SSF57716 1,1,1 MODERATE 1 SNV 1,1,1 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC ENSG00000099204 77 0 0 36 13 26.53 - - - 10 116247760 116247760 T C SNP ABLIM1 ENST00000277895 human ensembl 69_37n -1 known missense c.998 p.H333R 1000 smart_Znf_LIM pfam_Znf_LIM,pfam_Villin_headpiece,superfamily_Villin_headpiece,smart_Znf_LIM,smart_Villin_headpiece,pfscan_Villin_headpiece,pfscan_Znf_LIM - no_errors ABLIM1 HGNC - - - Putative_Driver Test driver Class 1 Class annotation ADAMTS20 genome.wustl.edu GRCh37 12 43944926 43944926 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2.85 Y80C getma.org/?cm=msa&ty=f&p=ATS20_HUMAN&rb=40&re=186&var=Y80C 
getma.org/?cm=var&var=hg19,12,43944926,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 p.Tyr80Cys p.Y80C ENST00000389420 2/39 0 0 ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000389420,NM_025003.3;ADAMTS20,missense_variant,p.Tyr80Cys,ENST00000553158,; C ENSG00000173157 ENST00000389420 Transcript missense_variant 239/6076 239/5733 80/1910 Y/C tAt/tGt COSM1476552,COSM1476551 1 ADAMTS20 HGNC 17178 protein_coding YES CCDS31778.2 ENSP00000374071 ATS20_HUMAN UPI00004565F4 NM_025003.3 deleterious(0) probably_damaging(1) 2/39 hmmpanther:PTHR13723,hmmpanther:PTHR13723:SF165,Pfam_domain:PF01562 1,1 MODERATE 1 SNV 1,1 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC ENSG00000173157 50 0 0 19 17 45.95 - - - 12 43944926 43944926 T C SNP ADAMTS20 ENST00000389420 human ensembl 69_37n -1 known missense c.239 p.Y80C 1000 pfam_Peptidase_M12B_N pfam_Pept_M12B_GON-ADAMTSs,pfam_Thrombospondin_1_rpt,pfam_Peptidase_M12B_N,pfam_ADAM_spacer1,pfam_Peptidase_M12B,superfamily_Thrombospondin_1_rpt,smart_Thrombospondin_1_rpt,prints_Peptidase_M12B_ADAM-TS,pfscan_Pept_M12B_GON-ADAMTSs,pfscan_Thrombospondin_1_rpt,pfscan_Peptidase_M12B - no_errors ADAMTS20 HGNC - - - Unknown Class 4 Class annotation -DTNB genome.wustl.edu GRCh37 2 25678299 25678299 -1 Missense_Mutation SNP C G T TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 C C Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2125 V382M getma.org/?cm=msa&ty=f&p=DTNB_HUMAN&rb=283&re=473&var=V382M getma.org/?cm=var&var=hg19,2,25678299,C,T&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 
c.1144C>A p.Val382Met p.V382M ENST00000406818 11/21 0 0 DTNB,missense_variant,p.Val382Met,ENST00000406818,NM_001256303.1,NM_021907.4;DTNB,missense_variant,p.Val382Met,ENST00000407661,NM_183360.2,NM_001256304.1;DTNB,missense_variant,p.Val382Met,ENST00000404103,NM_033147.3;DTNB,missense_variant,p.Val382Met,ENST00000288642,;DTNB,missense_variant,p.Val325Met,ENST00000496972,NM_001256308.1;DTNB,missense_variant,p.Val178Met,ENST00000545439,;DTNB,intron_variant,,ENST00000407038,NM_033148.3;DTNB,intron_variant,,ENST00000407186,;DTNB,intron_variant,,ENST00000405222,NM_183361.2;DTNB,intron_variant,,ENST00000489756,;DTNB,intron_variant,,ENST00000481841,;DTNB,intron_variant,,ENST00000486555,;DTNB,3_prime_UTR_variant,,ENST00000398951,;DTNB,non_coding_transcript_exon_variant,,ENST00000485845,;DTNB,non_coding_transcript_exon_variant,,ENST00000479898,;DTNB,intron_variant,,ENST00000356599,;DTNB,intron_variant,,ENST00000482145,; T ENSG00000138101 ENST00000406818 Transcript missense_variant 1394/2474 1144/1884 382/627 V/M Gtg/Atg COSM3839175,COSM3839176 1 DTNB HGNC 3058 protein_coding YES CCDS46237.1 ENSP00000384084 DTNB_HUMAN Q53TC8_HUMAN,Q53T51_HUMAN,Q53SF9_HUMAN,Q53QV1_HUMAN,F8W9U0_HUMAN,E9PE76_HUMAN,E7ES64_HUMAN UPI0000129949 NM_001256303.1,NM_021907.4 deleterious(0.03) benign(0.379) 11/21 hmmpanther:PTHR11915:SF227,hmmpanther:PTHR11915,PIRSF_domain:PIRSF038204 1,1 MODERATE 1 SNV 1,1 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC ENSG00000138101 35 0 0 9 9 50 - - - 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC - - - Putative_Passenger Test passenger Class 1 Class annotation 
+DTNB genome.wustl.edu GRCh37 2 25678299 25678299 -1 Missense_Mutation SNP C G T TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 C C Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx medium 2125 V382M getma.org/?cm=msa&ty=f&p=DTNB_HUMAN&rb=283&re=473&var=V382M getma.org/?cm=var&var=hg19,2,25678299,C,T&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.1144C>A p.Val382Met p.V382M ENST00000406818 11/21 0 0 DTNB,missense_variant,p.Val382Met,ENST00000406818,NM_001256303.1,NM_021907.4;DTNB,missense_variant,p.Val382Met,ENST00000407661,NM_183360.2,NM_001256304.1;DTNB,missense_variant,p.Val382Met,ENST00000404103,NM_033147.3;DTNB,missense_variant,p.Val382Met,ENST00000288642,;DTNB,missense_variant,p.Val325Met,ENST00000496972,NM_001256308.1;DTNB,missense_variant,p.Val178Met,ENST00000545439,;DTNB,intron_variant,,ENST00000407038,NM_033148.3;DTNB,intron_variant,,ENST00000407186,;DTNB,intron_variant,,ENST00000405222,NM_183361.2;DTNB,intron_variant,,ENST00000489756,;DTNB,intron_variant,,ENST00000481841,;DTNB,intron_variant,,ENST00000486555,;DTNB,3_prime_UTR_variant,,ENST00000398951,;DTNB,non_coding_transcript_exon_variant,,ENST00000485845,;DTNB,non_coding_transcript_exon_variant,,ENST00000479898,;DTNB,intron_variant,,ENST00000356599,;DTNB,intron_variant,,ENST00000482145,; T ENSG00000138101 ENST00000406818 Transcript missense_variant 1394/2474 1144/1884 382/627 V/M Gtg/Atg COSM3839175,COSM3839176 1 DTNB HGNC 3058 protein_coding YES CCDS46237.1 ENSP00000384084 DTNB_HUMAN Q53TC8_HUMAN,Q53T51_HUMAN,Q53SF9_HUMAN,Q53QV1_HUMAN,F8W9U0_HUMAN,E9PE76_HUMAN,E7ES64_HUMAN UPI0000129949 NM_001256303.1,NM_021907.4 deleterious(0.03) benign(0.379) 11/21 hmmpanther:PTHR11915:SF227,hmmpanther:PTHR11915,PIRSF_domain:PIRSF038204 1,1 MODERATE 1 SNV 1,1 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin 
pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC ENSG00000138101 35 0 0 9 9 50 - - - 2 25678299 25678299 C T SNP DTNB ENST00000406818 human ensembl 69_37n -1 known missense c.1144 p.V382M 1000 pirsf_Distrobrevin pfam_EF-hand_dom_typ1,pfam_EF-hand_dom_typ2,pfam_Znf_ZZ,smart_Znf_ZZ,pirsf_Distrobrevin,pfscan_Znf_ZZ - no_errors DTNB HGNC - - - Putative_Passenger Test passenger Class 1 Class annotation TP53 genome.wustl.edu GRCh37 17 7578253 7578253 0 Missense_Mutation SNP C C A TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A C Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx medium 3.005 getma.org/?cm=msa&ty=f&p=P53_HUMAN&rb=95&re=289&var=G199V getma.org/pdb.php?prot=P53_HUMAN&from=95&to=289&var=G199V getma.org/?cm=var&var=hg19,17,7578253,C,A&fts=all ENST00000269305.4:c.596G>T p.Gly199Val p.G199V ENST00000269305 11-Jun 0 0 TP53,missense_variant,p.Gly199Val,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,missense_variant,p.Gly199Val,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,missense_variant,p.Gly199Val,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,missense_variant,p.Gly199Val,ENST00000445888,;TP53,missense_variant,p.Gly199Val,ENST00000359597,;TP53,missense_variant,p.Gly199Val,ENST00000413465,;TP53,missense_variant,p.Gly67Val,ENST00000509690,;TP53,missense_variant,p.Gly106Val,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,upstream_gene_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000574684,;TP53,non_coding_transcript_exon_variant,,ENST00000510385,;TP53,non_coding_transcript_exon_variant,,ENST00000504290,;TP53,non_coding_transcript_exon_variant,,ENST00000504937,;TP53,non_coding_transcript_exon_variant,,ENST00000505014,; A ENSG00000141510 ENST00000269305 Transcript missense_variant 786/2579 596/1182 199 G/V gGa/gTa 
TP53_g.12665G>T,COSM44140,COSM255788,COSM255787,COSM255789,COSM3675525,COSM3675524,COSM255790 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 deleterious(0) probably_damaging(1) 11-Jun Gene3D:2.60.40.720,Pfam_domain:PF00870,hmmpanther:PTHR11447,hmmpanther:PTHR11447:SF6,Superfamily_domains:SSF49417 0,1,1,1,1,1,1,1 MODERATE 1 SNV 0,1,1,1,1,1,1,1 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - 17 7578253 7578253 C A SNP TP53 NM_000546.4 human genbank 58_37c -1 reviewed missense c.596 p.G199V 1 HMMPfam_P53|7Csuperfamily_p53-like transcription factors HMMPfam_P53_TAD|7CHMMPfam_P53|7Csuperfamily_p53-like transcription factors|7CPatternScan_P53|7CHMMPfam_P53_tetramer|7Csuperfamily_p53 tetramerization domain - TP53 genome.wustl.edu GRCh37 17 7576851 7576851 0 Splice_Site SNP A A C novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A A Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000269305.4:c.993+2T>G p.X331_splice ENST00000269305 0 0 
TP53,splice_donor_variant,,ENST00000420246,NM_001126114.2,NM_001276696.1;TP53,splice_donor_variant,,ENST00000455263,NM_001276695.1,NM_001126113.2;TP53,splice_donor_variant,,ENST00000269305,NM_001126112.2,NM_001276761.1,NM_001276760.1,NM_000546.5,NM_001126118.1;TP53,splice_donor_variant,,ENST00000445888,;TP53,splice_donor_variant,,ENST00000359597,;TP53,splice_donor_variant,,ENST00000576024,;TP53,intron_variant,,ENST00000413465,;TP53,downstream_gene_variant,,ENST00000509690,;TP53,downstream_gene_variant,,ENST00000508793,;TP53,downstream_gene_variant,,ENST00000604348,;TP53,downstream_gene_variant,,ENST00000503591,;TP53,downstream_gene_variant,,ENST00000514944,;TP53,downstream_gene_variant,,ENST00000574684,;TP53,splice_donor_variant,,ENST00000510385,;TP53,splice_donor_variant,,ENST00000504290,;TP53,splice_donor_variant,,ENST00000504937,;TP53,downstream_gene_variant,,ENST00000505014,; C ENSG00000141510 ENST00000269305 Transcript splice_donor_variant -/2579 993/1182 TP53_g.14067T>G,COSM29774,COSM146229 1 TP53 HGNC 11998 protein_coding YES CCDS11118.1 ENSP00000269305 P53_HUMAN S5LQU8_HUMAN,Q761V2_HUMAN,Q6IT77_HUMAN,Q1HGV1_HUMAN,Q0PKT5_HUMAN,L0ES54_HUMAN,L0EQ05_HUMAN,K7PPA8_HUMAN,H2EHT1_HUMAN,G4Y083_HUMAN,E9PCY9_HUMAN,E7ESS1_HUMAN,E7EMR6_HUMAN,B5AKF6_HUMAN,B4DNI2_HUMAN,A4GWD0_HUMAN,A4GWB8_HUMAN,A4GWB5_HUMAN,A4GW97_HUMAN,A4GW76_HUMAN,A4GW75_HUMAN,A4GW74_HUMAN,A4GW67_HUMAN,A2I9Z1_HUMAN,A2I9Z0_HUMAN UPI000002ED67 NM_001126112.2 10-Sep 0,1,1 HIGH 1 SNV 0,1,1 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - 17 7576851 7576851 A C SNP TP53 NM_000546 human genbank 57_37b -1 reviewed splice_site c.993+2 e8+2 1 - - - -BRCA1 genome.wustl.edu GRCh37 17 41243581 41243581 0 Nonsense_Mutation SNP G G A rs80357262 TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 A G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx 0 getma.org/?cm=var&var=hg19,17,41243581,G,A&fts=all ENST00000357654.3:c.3967C>T p.Gln1323Ter p.Q1323* ENST00000357654 23-Oct 0 
0 BRCA1,stop_gained,p.Gln1027Ter,ENST00000309486,NM_007297.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000357654,NM_007294.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000346315,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000354071,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000471181,NM_007300.3;BRCA1,stop_gained,p.Gln1276Ter,ENST00000493795,;BRCA1,stop_gained,p.Gln88Ter,ENST00000461574,;BRCA1,intron_variant,,ENST00000352993,;BRCA1,intron_variant,,ENST00000351666,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,intron_variant,,ENST00000491747,NM_007298.3;BRCA1,intron_variant,,ENST00000478531,;BRCA1,intron_variant,,ENST00000493919,;BRCA1,intron_variant,,ENST00000484087,;BRCA1,intron_variant,,ENST00000591534,;BRCA1,intron_variant,,ENST00000487825,;BRCA1,intron_variant,,ENST00000586385,;BRCA1,intron_variant,,ENST00000591849,;BRCA1,downstream_gene_variant,,ENST00000470026,;BRCA1,downstream_gene_variant,,ENST00000477152,;BRCA1,downstream_gene_variant,,ENST00000494123,;BRCA1,downstream_gene_variant,,ENST00000473961,;BRCA1,downstream_gene_variant,,ENST00000497488,;BRCA1,downstream_gene_variant,,ENST00000476777,;BRCA1,3_prime_UTR_variant,,ENST00000461221,;BRCA1,non_coding_transcript_exon_variant,,ENST00000467274,;BRCA1,downstream_gene_variant,,ENST00000492859,;BRCA1,downstream_gene_variant,,ENST00000412061,; A ENSG00000012048 ENST00000357654 Transcript stop_gained 4086/7094 3967/5592 1323 Q/* Caa/Taa rs80357262 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 23-Oct PIRSF_domain:PIRSF001734,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0 not_provided,pathogenic HIGH SNV 1 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 
human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - +BRCA1 genome.wustl.edu GRCh37 17 41243581 41243581 0 Nonsense_Mutation SNP G G A rs80357262 TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 A G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx 0 getma.org/?cm=var&var=hg19,17,41243581,G,A&fts=all ENST00000357654.3:c.3967C>T p.Gln1323Ter p.Q1323* ENST00000357654 23-Oct 0 0 BRCA1,stop_gained,p.Gln1027Ter,ENST00000309486,NM_007297.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000357654,NM_007294.3;BRCA1,stop_gained,p.Gln1323Ter,ENST00000346315,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000354071,;BRCA1,stop_gained,p.Gln1323Ter,ENST00000471181,NM_007300.3;BRCA1,stop_gained,p.Gln1276Ter,ENST00000493795,;BRCA1,stop_gained,p.Gln88Ter,ENST00000461574,;BRCA1,intron_variant,,ENST00000352993,;BRCA1,intron_variant,,ENST00000351666,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,intron_variant,,ENST00000491747,NM_007298.3;BRCA1,intron_variant,,ENST00000478531,;BRCA1,intron_variant,,ENST00000493919,;BRCA1,intron_variant,,ENST00000484087,;BRCA1,intron_variant,,ENST00000591534,;BRCA1,intron_variant,,ENST00000487825,;BRCA1,intron_variant,,ENST00000586385,;BRCA1,intron_variant,,ENST00000591849,;BRCA1,downstream_gene_variant,,ENST00000470026,;BRCA1,downstream_gene_variant,,ENST00000477152,;BRCA1,downstream_gene_variant,,ENST00000494123,;BRCA1,downstream_gene_variant,,ENST00000473961,;BRCA1,downstream_gene_variant,,ENST00000497488,;BRCA1,downstream_gene_variant,,ENST00000476777,;BRCA1,3_prime_UTR_variant,,ENST00000461221,;BRCA1,non_coding_transcript_exon_variant,,ENST00000467274,;BRCA1,d
ownstream_gene_variant,,ENST00000492859,;BRCA1,downstream_gene_variant,,ENST00000412061,; A ENSG00000012048 ENST00000357654 Transcript stop_gained 4086/7094 3967/5592 1323 Q/* Caa/Taa rs80357262 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 23-Oct PIRSF_domain:PIRSF001734,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0 not_provided,pathogenic HIGH SNV 1 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - 17 41243581 41243581 G A SNP BRCA1 NM_007294.3 human genbank 58_37c -1 reviewed nonsense c.3967 p.Q1323* 0.949 HMMPfam_BRCT|7CHMMSmart_SM00292|7Csuperfamily_BRCT domain|7CHMMSmart_SM00184|7CPatternScan_ZF_RING_1|7CHMMPfam_zf-C3HC4|7Csuperfamily_RING/U-box - BRCA1 genome.wustl.edu GRCh37 17 41201181 41201181 0 Missense_Mutation SNP C C A rs80357069 byCluster TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 C C Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx medium 2.25 getma.org/?cm=msa&ty=f&p=BRCA1_HUMAN&rb=1756&re=1842&var=G1788V getma.org/pdb.php?prot=BRCA1_HUMAN&from=1756&to=1842&var=G1788V getma.org/?cm=var&var=hg19,17,41201181,C,A&fts=all ENST00000357654.3:c.5363G>T p.Gly1788Val p.G1788V ENST00000357654 21/23 0 0 
BRCA1,missense_variant,p.Gly1492Val,ENST00000309486,NM_007297.3;BRCA1,missense_variant,p.Gly1788Val,ENST00000357654,NM_007294.3;BRCA1,missense_variant,p.Gly1549Val,ENST00000346315,;BRCA1,missense_variant,p.Gly1523Val,ENST00000354071,;BRCA1,missense_variant,p.Gly1809Val,ENST00000471181,NM_007300.3;BRCA1,missense_variant,p.Gly1741Val,ENST00000493795,;BRCA1,missense_variant,p.Gly646Val,ENST00000352993,;BRCA1,missense_variant,p.Gly605Val,ENST00000351666,;BRCA1,missense_variant,p.Gly684Val,ENST00000491747,NM_007298.3;BRCA1,missense_variant,p.Gly279Val,ENST00000591534,;BRCA1,missense_variant,p.Gly98Val,ENST00000586385,;BRCA1,missense_variant,p.Gly21Val,ENST00000591849,;BRCA1,intron_variant,,ENST00000468300,NM_007299.3;BRCA1,3_prime_UTR_variant,,ENST00000461221,; A ENSG00000012048 ENST00000357654 Transcript missense_variant 5482/7094 5363/5592 1788 G/V gGt/gTt rs80357069,COSM436662 1 BRCA1 HGNC 1100 protein_coding CCDS11453.1 ENSP00000350283 BRCA1_HUMAN Q9UE29_HUMAN,Q9NQR3_HUMAN,Q92897_HUMAN,Q7KYU6_HUMAN,Q4EW25_HUMAN,Q3YB53_HUMAN,Q3YB50_HUMAN,Q3YB49_HUMAN,Q3LRH8_HUMAN,Q3B891_HUMAN,K7EPC7_HUMAN,K4K7V3_HUMAN,K4JXS7_HUMAN,K4JUB1_HUMAN,G4V503_HUMAN,G4V502_HUMAN,G4V500_HUMAN,G4V4Z8_HUMAN,G4V4Z7_HUMAN,G1UI37_HUMAN,E9PFZ0_HUMAN,E7EWN5_HUMAN,E7EP70_HUMAN,C9IZW4_HUMAN,C4PFY7_HUMAN UPI0000126AC8 NM_007294.3 deleterious(0) benign(0.031) 21/23 Gene3D:3.40.50.10190,Pfam_domain:PF00533,PIRSF_domain:PIRSF001734,Prints_domain:PR00493,PROSITE_profiles:PS50172,hmmpanther:PTHR13763,hmmpanther:PTHR13763:SF0,SMART_domains:SM00292,Superfamily_domains:SSF52113 not_provided,pathogenic 0,1 MODERATE SNV 1,1 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - 17 41201181 41201181 C A SNP BRCA1 NM_007294 human genbank 57_37b -1 reviewed missense c.5363 p.G1788V 
1 HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain superfamily_RING/U-box,HMMSmart_SM00184,HMMPfam_zf-C3HC4,PatternScan_ZF_RING_1,HMMPfam_BRCT,HMMSmart_SM00292,superfamily_BRCT domain - -ATM genome.wustl.edu GRCh37 11 108173702 108173702 0 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 - G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx ENST00000278616.4:c.5443delG p.Asp1815ThrfsTer13 p.D1815Tfs*13 ENST00000278616 36/63 0 0 ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000452508,;ATM,non_coding_transcript_exon_variant,,ENST00000524792,;ATM,non_coding_transcript_exon_variant,,ENST00000533690,;ATM,non_coding_transcript_exon_variant,,ENST00000534625,;ATM,upstream_gene_variant,,ENST00000529588,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 5827/13147 5442/9171 1814 L/X ttG/tt rs772138812 1 ATM HGNC 795 protein_coding YES CCDS31669.1 ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 36/63 hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 HIGH 1 deletion 1 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - +ATM genome.wustl.edu GRCh37 11 108173702 108173702 0 Frame_Shift_Del DEL G G - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 - G Unknown Germline Phase_IV Capture 1 dbGAP IlluminaGAIIx ENST00000278616.4:c.5443delG 
p.Asp1815ThrfsTer13 p.D1815Tfs*13 ENST00000278616 36/63 0 0 ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Asp1815ThrfsTer13,ENST00000452508,;ATM,non_coding_transcript_exon_variant,,ENST00000524792,;ATM,non_coding_transcript_exon_variant,,ENST00000533690,;ATM,non_coding_transcript_exon_variant,,ENST00000534625,;ATM,upstream_gene_variant,,ENST00000529588,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 5827/13147 5442/9171 1814 L/X ttG/tt rs772138812 1 ATM HGNC 795 protein_coding YES CCDS31669.1 ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 36/63 hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 HIGH 1 deletion 1 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - 11 108173702 108173702 G - DEL ATM NM_000051.3 human genbank 58_37c 1 reviewed frame_shift_del c.5442 p.D1815fs 1 superfamily_ARM repeat superfamily_ARM repeat|7CHMMPfam_FAT|7Csuperfamily_Protein kinase-like (PK-like)|7CHMMPfam_PI3_PI4_kinase|7CHMMSmart_SM00146|7CPatternScan_PI3_4_KINASE_1|7CPatternScan_PI3_4_KINASE_2|7CHMMPfam_FATC - ATM genome.wustl.edu GRCh37 11 108106472 108106472 0 Frame_Shift_Del DEL T T - novel unknown TCGA-A1-A0SB-01 TCGA-A1-A0SB-10 T T Unknown Somatic Phase_IV Capture 1 dbGAP Illumina GAIIx ENST00000278616.4:c.409delT p.Tyr137ThrfsTer16 p.Y137Tfs*16 ENST00000278616 May-63 0 0 
ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000278616,NM_000051.3;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000452508,;ATM,frameshift_variant,p.Tyr137ThrfsTer16,ENST00000527805,;ATM,intron_variant,,ENST00000527891,;ATM,downstream_gene_variant,,ENST00000601453,;ATM,non_coding_transcript_exon_variant,,ENST00000530958,; - ENSG00000149311 ENST00000278616 Transcript frameshift_variant 792/13147 407/9171 136 I/X aTt/at COSM428356,COSM1474979 1 ATM HGNC 795 protein_coding YES CCDS31669.1 ENSP00000278616 ATM_HUMAN M0QXY8_HUMAN,E9PRG7_HUMAN,E9PIN0_HUMAN UPI0000DBEF44 NM_000051.3 May-63 Pfam_domain:PF11640,hmmpanther:PTHR11139,hmmpanther:PTHR11139:SF66 1,1 HIGH 1 deletion 2 1,1 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like (PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) 11 108106472 108106472 T - DEL ATM NM_000051 human genbank 57_37b 1 reviewed frame_shift_del c.407 p.Y137fs 0.013 superfamily_ARM repeat,HMMPfam_FAT,superfamily_Protein kinase-like (PK-like),HMMPfam_PI3_PI4_kinase,HMMSmart_SM00146,PatternScan_PI3_4_KINASE_1,PatternScan_PI3_4_KINASE_2,HMMPfam_FATC (deletion:cds_exon[108106397,108106561]) -KAT2A genome.wustl.edu GRCh37 17 40272381 40272381 -1 Silent SNP G G A TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 G G Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.471C>T p.= p.H157H ENST00000225916 3/18 0 0 
KAT2A,synonymous_variant,p.=,ENST00000225916,NM_021078.2;CTD-2132N18.3,synonymous_variant,p.=,ENST00000592574,;RAB5C,downstream_gene_variant,,ENST00000393860,NM_201434.2;RAB5C,downstream_gene_variant,,ENST00000346213,NM_004583.3;HSPB9,upstream_gene_variant,,ENST00000355067,NM_033194.2;CTD-2132N18.3,missense_variant,p.Thr150Met,ENST00000592248,;KAT2A,synonymous_variant,p.=,ENST00000465682,;CTD-2132N18.3,3_prime_UTR_variant,,ENST00000585562,;KAT2A,upstream_gene_variant,,ENST00000592310,;KAT2A,upstream_gene_variant,,ENST00000588759,; A ENSG00000108773 ENST00000225916 Transcript synonymous_variant 525/3109 471/2514 157/837 H caC/caT rs536716483,COSM1479581 1 KAT2A HGNC 4201 protein_coding YES CCDS11417.1 ENSP00000225916 KAT2A_HUMAN K7ERS6_HUMAN UPI000000D978 NM_021078.2 3/18 hmmpanther:PTHR22880:SF124,hmmpanther:PTHR22880,Pfam_domain:PF06466,PIRSF_domain:PIRSF003048 A:0.0002 A:0 A:0 A:0.001 A:0 A:0 0,1 LOW 1 SNV 0,1 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene ENSG00000267261 40 0 0 30 36 54.55 - - - 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene - - - Putative_Driver Test driver Class 1 Class annotation +KAT2A genome.wustl.edu GRCh37 17 40272381 40272381 -1 Silent SNP G G A TCGA-BH-NEW-01 TCGA-BH-NEW-10 G G 
Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.471C>T p.= p.H157H ENST00000225916 3/18 0 0 KAT2A,synonymous_variant,p.=,ENST00000225916,NM_021078.2;CTD-2132N18.3,synonymous_variant,p.=,ENST00000592574,;RAB5C,downstream_gene_variant,,ENST00000393860,NM_201434.2;RAB5C,downstream_gene_variant,,ENST00000346213,NM_004583.3;HSPB9,upstream_gene_variant,,ENST00000355067,NM_033194.2;CTD-2132N18.3,missense_variant,p.Thr150Met,ENST00000592248,;KAT2A,synonymous_variant,p.=,ENST00000465682,;CTD-2132N18.3,3_prime_UTR_variant,,ENST00000585562,;KAT2A,upstream_gene_variant,,ENST00000592310,;KAT2A,upstream_gene_variant,,ENST00000588759,; A ENSG00000108773 ENST00000225916 Transcript synonymous_variant 525/3109 471/2514 157/837 H caC/caT rs536716483,COSM1479581 1 KAT2A HGNC 4201 protein_coding YES CCDS11417.1 ENSP00000225916 KAT2A_HUMAN K7ERS6_HUMAN UPI000000D978 NM_021078.2 3/18 hmmpanther:PTHR22880:SF124,hmmpanther:PTHR22880,Pfam_domain:PF06466,PIRSF_domain:PIRSF003048 A:0.0002 A:0 A:0 A:0.001 A:0 A:0 0,1 LOW 1 SNV 0,1 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 Clone_based_vega_gene ENSG00000267261 40 0 0 30 36 54.55 - - - 17 40272381 40272381 G A SNP ENSG00000267261 ENST00000592248 human ensembl 69_37n -1 known missense c.449 p.T150M 486 smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho pfam_Small_GTPase,pfam_MIRO-like,pfam_Small_GTPase_ARF/SAR,pfam_Gtr1_RagA,smart_Small_GTPase_Ras,smart_Small_GTPase_Rab_type,smart_Small_GTPase_Rho,prints_Small_GTPase,tigrfam_Small_GTP-bd_dom - no_errors CTD-2132N18.3 
Clone_based_vega_gene - - - Putative_Driver Test driver Class 1 Class annotation MSH3 genome.wustl.edu GRCh37 5 80024722 80024722 1 Frame_Shift_Del DEL T T - TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.1508delT p.Leu503TrpfsTer5 p.L503Wfs*5 ENST00000265081 10/24 0 0 MSH3,frameshift_variant,p.Leu503TrpfsTer5,ENST00000265081,NM_002439.4;MSH3,non_coding_transcript_exon_variant,,ENST00000512258,; - ENSG00000113318 ENST00000265081 Transcript frameshift_variant 1586/4092 1506/3414 502/1137 S/X tcT/tc 1 MSH3 HGNC 7326 protein_coding YES CCDS34195.1 ENSP00000265081 MSH3_HUMAN UPI0000DBEE85 NM_002439.4 10/24 Superfamily_domains:SSF53150,Gene3D:3.30.420.110,Pfam_domain:PF05188,hmmpanther:PTHR11361,hmmpanther:PTHR11361:SF34 HIGH 1 deletion 2 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC ENSG00000113318 83 0 0 12 2 14.29 - - - 5 80024722 80024722 T - DEL MSH3 ENST00000265081 human ensembl 69_37n 1 known frame_shift_del c.1506 p.L503fs 998 pfam_DNA_mismatch_repair_MutS_connt,superfamily_DNA_mismatch_repair_MutS_connt 
pfam_DNA_mismatch_repair_MutS_C,pfam_DNA_mismatch_repair_MutS_core,pfam_DNA_mismatch_repair_MutS_connt,pfam_DNA_mismatch_repair_MutS-lik_N,pfam_DNA_mismatch_repair_MutS_clamp,superfamily_DNA_mismatch_repair_MutS_core,superfamily_DNA_mismatch_repair_MutS_N,superfamily_DNA_mismatch_repair_MutS_connt,smart_DNA_mismatch_repair_MutS_core,smart_DNA_mismatch_repair_MutS_C - no_errors MSH3 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation MYB genome.wustl.edu GRCh37 6 135507043 135507044 1 Frame_Shift_Ins INS - - A TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 - - Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.27dupA p.Tyr10IlefsTer2 p.Y10Ifs*2 ENST00000367814 2/15 0 0 MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000341911,NM_001130173.1,NM_001161658.1,NM_001161656.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000316528,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000442647,NM_001161660.1,NM_001130172.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367814,NM_001161659.1,NM_005375.2;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525369,NM_001161657.1;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000527615,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528774,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534121,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533624,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000534044,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000420123,;MYB,upstream_gene_variant,,ENST00000430686,;MYB,non_coding_transcript_exon_variant,,ENST00000531845,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000367812,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533837,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000438901,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525477,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000463282,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000339290,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00
000533808,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525514,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529586,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526889,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526320,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531519,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000533384,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000531737,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000529262,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526565,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528015,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000526187,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525002,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528343,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528140,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000528345,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000525940,;MYB,frameshift_variant,p.Tyr10Ter,ENST00000531634,;MYB,frameshift_variant,p.Tyr10IlefsTer2,ENST00000524588,; A ENSG00000118513 ENST00000367814 Transcript frameshift_variant 212-213/3302 26-27/1923 9/640 I/IX ata/atAa COSM1487247,COSM1487248 1 MYB HGNC 7545 protein_coding CCDS5174.1 ENSP00000356788 MYB_HUMAN Q9UMI7_HUMAN,Q708J0_HUMAN,Q708E9_HUMAN,Q708E3_HUMAN UPI000012FAEA NM_001161659.1,NM_005375.2 2/15 hmmpanther:PTHR10641,hmmpanther:PTHR10641:SF454 1,1 HIGH insertion 1 1,1 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC ENSG00000118513 50 0 0 36 4 10 - - - 6 135507043 135507044 - A INS MYB ENST00000341911 human ensembl 69_37n 1 known frame_shift_ins c.26_27 p.Y10fs 1.000:0.997 pfam_C-myb_C,pfam_SANT/Myb,pfam_Tscrpt_reg_Wos2-domain,superfamily_Homeodomain-like,smart_SANT/Myb,pfscan_Myb-like_dom - no_errors MYB HGNC - - - Putative_Passenger Test 
passenger PIEZO1 genome.wustl.edu GRCh37 16 88790292 88790292 -1 Missense_Mutation SNP T T C TCGA-A1-A0SB-03 TCGA-A1-A0SB-30 T T Unknown Untested Somatic Phase_IV WXS none 1 dbGAP Illumina GAIIx low 1.18 Q1441R getma.org/?cm=msa&ty=f&p=PIEZ1_HUMAN&rb=58&re=1627&var=Q1441R getma.org/?cm=var&var=hg19,16,88790292,T,C&fts=all db9d40fb-bfce-4c3b-a6c2-41c5c88982f1 a3254f8e-3bbd-42fc-abea-a5f25b7648b3 c.4322A>G p.Gln1441Arg p.Q1441R ENST00000301015 31/51 0 0 PIEZO1,missense_variant,p.Gln1441Arg,ENST00000301015,NM_001142864.2;PIEZO1,missense_variant,p.Gln115Arg,ENST00000474606,;PIEZO1,upstream_gene_variant,,ENST00000327397,;PIEZO1,upstream_gene_variant,,ENST00000466823,;RP5-1142A6.9,downstream_gene_variant,,ENST00000564984,;PIEZO1,non_coding_transcript_exon_variant,,ENST00000566414,;PIEZO1,upstream_gene_variant,,ENST00000419505,;PIEZO1,upstream_gene_variant,,ENST00000497793,;PIEZO1,upstream_gene_variant,,ENST00000495568,;PIEZO1,downstream_gene_variant,,ENST00000475586,;PIEZO1,downstream_gene_variant,,ENST00000491917,; C ENSG00000103335 ENST00000301015 Transcript missense_variant 4569/8072 4322/7566 1441/2521 Q/R cAg/cGg COSM1479166 1 PIEZO1 HGNC 28993 protein_coding YES CCDS54058.1 ENSP00000301015 PIEZ1_HUMAN UPI0001B300F3 NM_001142864.2 tolerated(0.25) possibly_damaging(0.78) 31/51 hmmpanther:PTHR13167,hmmpanther:PTHR13167:SF40 1 MODERATE 1 SNV 1 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC ENSG00000103335 37 0 0 20 8 28.57 - - - 16 88790292 88790292 T C SNP PIEZO1 ENST00000301015 human ensembl 69_37n -1 novel missense c.4322 p.Q1441R 1000 pfam_DUF3595 - no_errors PIEZO1 HGNC - - - Putative_Passenger Test passenger Class 3 Class annotation From c6eddbba71c3d0d7e735836e488b59d7373e5610 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 13:45:16 +0200 Subject: [PATCH 025/130] Specify that data_directory for incremental data --- 
scripts/importer/metaImport.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/metaImport.py b/scripts/importer/metaImport.py index 1e06049e..7b6a1a35 100755 --- a/scripts/importer/metaImport.py +++ b/scripts/importer/metaImport.py @@ -60,7 +60,7 @@ def interface(): data_source_group.add_argument('-s', '--study_directory', type=str, help='path to study directory.') data_source_group.add_argument('-d', '--data_directory', - type=str, help='path to directory.') + type=str, help='path to data directory for incremental upload.') portal_mode_group = parser.add_mutually_exclusive_group() portal_mode_group.add_argument('-u', '--url_server', type=str, From 595d24f605060d3c7cd4835847ae71f998f37d7f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:12:43 +0200 Subject: [PATCH 026/130] Disambiguate clinical data constants names Not it was easy to be confused where sample and clinical_sample (attributes), patient and clinical_patient (attributes) related code --- .../cbio/portal/dao/DaoClinicalData.java | 62 +++++++++---------- .../portal/scripts/ImportClinicalData.java | 4 +- 2 files changed, 33 insertions(+), 33 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index 1b2ccd25..8a5aaf30 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -47,15 +47,15 @@ */ public final class DaoClinicalData { - public static final String SAMPLE_TABLE = "clinical_sample"; - public static final String PATIENT_TABLE = "clinical_patient"; + public static final String SAMPLE_ATTRIBUTES_TABLE = "clinical_sample"; + public static final String PATIENT_ATTRIBUTES_TABLE = "clinical_patient"; - private static final String SAMPLE_INSERT = "INSERT INTO " + SAMPLE_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; - private static final String 
PATIENT_INSERT = "INSERT INTO " + PATIENT_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static final String SAMPLE_ATTRIBUTES_INSERT = "INSERT INTO " + SAMPLE_ATTRIBUTES_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; + private static final String PATIENT_ATTRIBUTES_INSERT = "INSERT INTO " + PATIENT_ATTRIBUTES_TABLE + "(`INTERNAL_ID`,`ATTR_ID`,`ATTR_VALUE`) VALUES(?,?,?)"; - private static final String SAMPLE_DELETE = "DELETE FROM " + SAMPLE_TABLE + " WHERE `INTERNAL_ID` = ?"; + private static final String SAMPLE_ATTRIBUTES_DELETE = "DELETE FROM " + SAMPLE_ATTRIBUTES_TABLE + " WHERE `INTERNAL_ID` = ?"; - private static final String PATIENT_DELETE = "DELETE FROM " + PATIENT_TABLE + " WHERE `INTERNAL_ID` = ?"; + private static final String PATIENT_ATTRIBUTES_DELETE = "DELETE FROM " + PATIENT_ATTRIBUTES_TABLE + " WHERE `INTERNAL_ID` = ?"; private static final Map sampleAttributes = new HashMap(); private static final Map patientAttributes = new HashMap(); @@ -64,8 +64,8 @@ private DaoClinicalData() {} public static synchronized void reCache() { clearCache(); - cacheAttributes(SAMPLE_TABLE, sampleAttributes); - cacheAttributes(PATIENT_TABLE, patientAttributes); + cacheAttributes(SAMPLE_ATTRIBUTES_TABLE, sampleAttributes); + cacheAttributes(PATIENT_ATTRIBUTES_TABLE, patientAttributes); } private static void clearCache() @@ -98,13 +98,13 @@ private static void cacheAttributes(String table, Map cache) public static int addSampleDatum(int internalSampleId, String attrId, String attrVal) throws DaoException { sampleAttributes.put(attrId, attrId); - return addDatum(SAMPLE_INSERT, SAMPLE_TABLE, internalSampleId, attrId, attrVal); + return addDatum(SAMPLE_ATTRIBUTES_INSERT, SAMPLE_ATTRIBUTES_TABLE, internalSampleId, attrId, attrVal); } public static int addPatientDatum(int internalPatientId, String attrId, String attrVal) throws DaoException { patientAttributes.put(attrId, attrId); - return addDatum(PATIENT_INSERT, PATIENT_TABLE, 
internalPatientId, attrId, attrVal); + return addDatum(PATIENT_ATTRIBUTES_INSERT, PATIENT_ATTRIBUTES_TABLE, internalPatientId, attrId, attrVal); } public static int addDatum(String query, String tableName, @@ -129,7 +129,7 @@ public static int addDatum(String query, String tableName, pstmt.setString(3, attrVal); int toReturn = pstmt.executeUpdate(); - if (tableName.equals(PATIENT_TABLE)) { + if (tableName.equals(PATIENT_ATTRIBUTES_TABLE)) { patientAttributes.put(attrId, attrId); } else { @@ -168,10 +168,10 @@ private static int getInternalCancerStudyId(String cancerStudyId) throws DaoExce private static String getAttributeTable(String attrId) throws DaoException { if (sampleAttributes.containsKey(attrId)) { - return SAMPLE_TABLE; + return SAMPLE_ATTRIBUTES_TABLE; } else if (patientAttributes.containsKey(attrId)) { - return (PATIENT_TABLE); + return (PATIENT_ATTRIBUTES_TABLE); } else { return null; @@ -212,7 +212,7 @@ public static List getDataByPatientId(int cancerStudyId, String pa { List internalIds = new ArrayList(); internalIds.add(DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudyId, patientId).getInternalId()); - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, internalIds); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, internalIds); } private static List getDataByInternalIds(int internalCancerStudyId, String table, List internalIds) throws DaoException @@ -250,7 +250,7 @@ public static List getData(String cancerStudyId) throws DaoExcepti public static List getData(int cancerStudyId) throws DaoException { - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, getPatientIdsByCancerStudy(cancerStudyId)); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, getPatientIdsByCancerStudy(cancerStudyId)); } private static List getPatientIdsByCancerStudy(int cancerStudyId) @@ -284,7 +284,7 @@ public static List getData(int cancerStudyId, Collection p 
patientIdsInt.add(DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudyId, patientId).getInternalId()); } - return getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt); + return getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt); } public static List getSampleAndPatientData(int cancerStudyId, Collection sampleIds) throws DaoException @@ -305,9 +305,9 @@ public static List getSampleAndPatientData(int cancerStudyId, Coll } sampleIdsForPatient.add(sampleId); } - List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_TABLE, sampleIdsInt); + List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_ATTRIBUTES_TABLE, sampleIdsInt); - List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt); + List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt); for (ClinicalData cd : patientClinicalData) { String stablePatientId = cd.getStableId(); Set sampleIdsForPatient = mapPatientIdSampleIds.get(stablePatientId); @@ -339,9 +339,9 @@ public static List getSampleAndPatientData(int cancerStudyId, Coll } sampleIdsForPatient.add(sampleId); } - List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_TABLE, sampleIdsInt, Collections.singletonList(attr.getAttrId())); + List sampleClinicalData = getDataByInternalIds(cancerStudyId, SAMPLE_ATTRIBUTES_TABLE, sampleIdsInt, Collections.singletonList(attr.getAttrId())); - List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_TABLE, patientIdsInt, Collections.singletonList(attr.getAttrId())); + List patientClinicalData = getDataByInternalIds(cancerStudyId, PATIENT_ATTRIBUTES_TABLE, patientIdsInt, Collections.singletonList(attr.getAttrId())); for (ClinicalData cd : patientClinicalData) { String stablePatientId = cd.getStableId(); Set sampleIdsForPatient = mapPatientIdSampleIds.get(stablePatientId); @@ -364,16 +364,16 @@ public static List getSampleData(int 
cancerStudyId, Collection getSampleData(int cancerStudyId, Collection getData(String cancerStudyId, Collection patientIds, ClinicalAttribute attr) throws DaoException @@ -402,7 +402,7 @@ public static List getData(String cancerStudyId, Collection getDataByInternalIds(int internalCancerStudyId, String table, List internalIds, Collection attributeIds) throws DaoException @@ -453,7 +453,7 @@ public static List getDataByAttributeIds(int internalCancerStudyId while(rs.next()) { Integer patientId = rs.getInt("INTERNAL_ID"); if (patients.contains(patientId)) { - clinicals.add(extract(PATIENT_TABLE, internalCancerStudyId, rs)); + clinicals.add(extract(PATIENT_ATTRIBUTES_TABLE, internalCancerStudyId, rs)); } } } @@ -478,7 +478,7 @@ private static ClinicalData extract(String table, int internalCancerStudyId, Res private static String getStableIdFromInternalId(String table, int internalId) { - if (table.equals(SAMPLE_TABLE)) { + if (table.equals(SAMPLE_ATTRIBUTES_TABLE)) { return DaoSample.getSampleById(internalId).getStableId(); } else { @@ -606,13 +606,13 @@ public static List getDataByPatientIds(int cancerStudyId, List getPatientsByAttribute(int cancerStudy, String paramName, String paramValue) throws DaoException { - List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, PATIENT_TABLE); + List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, PATIENT_ATTRIBUTES_TABLE); return InternalIdUtil.getPatientsById(ids); } public static List getSamplesByAttribute(int cancerStudy, String paramName, String paramValue) throws DaoException { - List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, SAMPLE_TABLE); + List ids = getIdsByAttribute(cancerStudy, paramName, paramValue, SAMPLE_ATTRIBUTES_TABLE); return InternalIdUtil.getSamplesById(ids); } @@ -682,13 +682,13 @@ public static Map> getCancerTypeInfoBySamples(List s } } - public static void removePatientData(int internalPatientId) throws DaoException { + public static void 
removePatientAttributesData(int internalPatientId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoClinicalData.class); - pstmt = con.prepareStatement(PATIENT_DELETE); + pstmt = con.prepareStatement(PATIENT_ATTRIBUTES_DELETE); pstmt.setInt(1, internalPatientId); pstmt.executeUpdate(); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 37d79125..11519601 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -337,7 +337,7 @@ private boolean addDatum(String[] fields, List columnAttrs, M if (sample != null) { internalSampleId = sample.getInternalId(); if (overwriteExisting && this.attributesType == AttributeTypes.SAMPLE_ATTRIBUTES) { - DaoClinicalData.removeSampleData(internalSampleId); + DaoClinicalData.removeSampleAttributesData(internalSampleId); } else { //this should be a WARNING in case of TCGA studies (see https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415) //and an ERROR in other studies. I.e. a sample should occur only once in clinical file! 
@@ -359,7 +359,7 @@ private boolean addDatum(String[] fields, List columnAttrs, M //patient exists, get internal id: internalPatientId = patient.getInternalId(); if (overwriteExisting && this.attributesType == AttributeTypes.PATIENT_ATTRIBUTES) { - DaoClinicalData.removePatientData(internalPatientId); + DaoClinicalData.removePatientAttributesData(internalPatientId); } } else { From c8b4c7385a61e2c544d4ee03b156abfbf51266fc Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:19:44 +0200 Subject: [PATCH 027/130] Remove unnecessary TODO comments --- src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index 9fa36f68..3a7472c0 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -1509,9 +1509,6 @@ public static void deleteAllRecordsInGeneticProfileForSample(long geneticProfile pstmt.setLong(1, geneticProfileId); pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); - - // TODO Remove row in mutation_event if it does not have mutations left - // TODO Remove profile if no mutations nor mutation_event(s) left } catch (SQLException e) { throw new DaoException(e); } finally { From efd34d866f93a817233709099d60ce13f6cbedd8 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:25:20 +0200 Subject: [PATCH 028/130] Remove MSK copyright mistakenly copy-pasted --- .../portal/scripts/UpdateCaseListsSampleIds.java | 15 --------------- .../TestIncrementalMutationsImport.java | 15 --------------- .../TestIncrementalPatientsImport.java | 15 --------------- .../incremental/TestIncrementalSamplesImport.java | 15 --------------- .../incremental/TestUpdateCaseListsSampleIds.java | 15 --------------- 5 files changed, 75 deletions(-) diff --git
a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 390398dc..daf94283 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -1,18 +1,3 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - /* * This file is part of cBioPortal. * diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java index 7293d57a..e4def0ed 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java @@ -1,18 +1,3 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. 
The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - /* * This file is part of cBioPortal. * diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java index f5e1884e..7dcf4d3c 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalPatientsImport.java @@ -1,18 +1,3 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - /* * This file is part of cBioPortal. 
* diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java index f6dfd0a3..93fea8ce 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalSamplesImport.java @@ -1,18 +1,3 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. - * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - /* * This file is part of cBioPortal. * diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java index 21432d1b..e473955f 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestUpdateCaseListsSampleIds.java @@ -1,18 +1,3 @@ -/* - * Copyright (c) 2015 Memorial Sloan-Kettering Cancer Center. 
- * - * This library is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY, WITHOUT EVEN THE IMPLIED WARRANTY OF MERCHANTABILITY OR FITNESS - * FOR A PARTICULAR PURPOSE. The software and documentation provided hereunder - * is on an "as is" basis, and Memorial Sloan-Kettering Cancer Center has no - * obligations to provide maintenance, support, updates, enhancements or - * modifications. In no event shall Memorial Sloan-Kettering Cancer Center be - * liable to any party for direct, indirect, special, incidental or - * consequential damages, including lost profits, arising out of the use of this - * software and its documentation, even if Memorial Sloan-Kettering Cancer - * Center has been advised of the possibility of such damage. - */ - /* * This file is part of cBioPortal. * From 3b39e0d0e57aa156fe08188fd933844daf8f414e Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:26:00 +0200 Subject: [PATCH 029/130] Fix comment of UpdateCaseListsSampleIds.run() method --- .../org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index daf94283..fc34b1b7 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -46,7 +46,7 @@ public UpdateCaseListsSampleIds(String[] args) { } /** - * Imports clinical data and clinical attributes (from the worksheet) + * Updates case list sample ids from clinical sample and case list files */ public void run() { parseArguments(); From fc785f663f8227f2d139e4559bc6812456206c6b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:28:48 +0200 Subject: [PATCH 030/130] Make --overwrite-existing flag description more generic This flag for command 
to upload molecular profile data --- src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index c7fafb92..39fedd5f 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -252,7 +252,7 @@ public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, Str .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } parser.accepts("overwrite-existing", - "Flag that enables re-uploading molecular data that already exist (the same profile and sample id) in the database.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + "Enables re-uploading molecular data that already exist for the given profile and sample.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String progName = "importScript"; From e7829510b91561e6ed4126b30e8efdcc7c04880f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 14:37:34 +0200 Subject: [PATCH 031/130] Add TODO comments for possible reuse of the code --- .../org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index fc34b1b7..14627b88 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -31,6 +31,7 @@ import java.util.*; import java.util.stream.Collectors; +//TODO Can we unify this class with ImportSampleList? 
public class UpdateCaseListsSampleIds extends ConsoleRunnable { private File metaFile; @@ -59,6 +60,7 @@ public void run() { updateCaseLists(this.caseListSampleIdToSampleIds); } + // TODO Can we reuse this logic in ImportSampleList.importSampleList(File dataFile) as well private Map> readCaseListFiles() { LinkedHashMap> result = new LinkedHashMap<>(); for (File caseListFile: this.caseListFiles) { From b53c8c404317039689111665a03d4770437d8a02 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 16:02:01 +0200 Subject: [PATCH 032/130] Update case lists for multiple clinical sample files Potentially for different studies --- scripts/importer/cbioportalImporter.py | 10 +++++----- .../cbio/portal/scripts/UpdateCaseListsSampleIds.java | 7 ++++++- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index d1fdbb8a..f059282e 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -467,20 +467,20 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES if not_supported_meta_types: raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) - # TODO it's to fragile to rely on the order of types like that. Too implicit for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: meta_pairs = meta_file_type_to_meta_files[meta_file_type] for meta_pair in meta_pairs: meta_filename, meta_dictionary = meta_pair data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) - # TODO we could also validate not supported types after loading is done? 
if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: - # TODO What if we have multiple clinical sample files? Throw exception or upload - meta_filename, meta_dictionary = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES][0] case_list_dirname = os.path.join(data_directory, 'case_lists') - update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) + sample_attributes_metas = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES] + for meta_pair in sample_attributes_metas: + meta_filename, meta_dictionary = meta_pair + LOGGER.info('Updating case lists with sample ids', extra={'filename_': meta_filename}) + update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) def usage(): diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 14627b88..70c0bc5c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -26,6 +26,7 @@ import org.mskcc.cbio.portal.dao.DaoSampleList; import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.*; import java.util.*; @@ -75,7 +76,11 @@ private Map> readCaseListFiles() { throw new RuntimeException(caseListFile.getAbsolutePath() + ": No cancer_study_identifier specified."); } if (!studyId.equals(this.cancerStudyStableId)) { - throw new RuntimeException(caseListFile.getAbsolutePath() + ": cancer_study_identifier expected to be " + this.cancerStudyStableId + " but found to be " + studyId); + ProgressMonitor.logWarning( + String.format( + "Skipping %s case list file as it belongs to %s study and we uploading %s study.", + caseListFile, studyId, 
this.cancerStudyStableId)); + continue; } String caseListStableId = properties.getProperty("stable_id"); if (caseListStableId == null || caseListStableId.trim().equals("")) { From 99550b577836e89edb6bbdabd669cfbc3c774733 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Apr 2024 21:37:20 +0200 Subject: [PATCH 033/130] Extract and reuse common logic to read and validate case lists --- .../cbio/portal/scripts/ImportSampleList.java | 53 ++++++------------- .../scripts/UpdateCaseListsSampleIds.java | 38 ++++--------- .../org/mskcc/cbio/portal/util/CaseList.java | 48 +++++++++++++++++ .../cbio/portal/util/CaseListReader.java | 43 +++++++++++++++ .../portal/validate/CaseListValidator.java | 48 +++++++++++++++++ 5 files changed, 164 insertions(+), 66 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/CaseList.java create mode 100644 src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java create mode 100644 src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java index 411de87f..7e94efb2 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java @@ -35,6 +35,7 @@ import org.mskcc.cbio.portal.model.*; import org.mskcc.cbio.portal.dao.*; import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.validate.CaseListValidator; import java.io.*; import java.util.*; @@ -46,50 +47,26 @@ public class ImportSampleList extends ConsoleRunnable { public static void importSampleList(File dataFile) throws IOException, DaoException { ProgressMonitor.setCurrentMessage("Read data from: " + dataFile.getAbsolutePath()); - Properties properties = new TrimmedProperties(); - properties.load(new FileInputStream(dataFile)); + CaseList caseList = CaseListReader.readFile(dataFile); + 
CaseListValidator.validateAll(caseList); - String stableId = properties.getProperty("stable_id"); - - if (stableId.contains(" ")) { - throw new IllegalArgumentException("stable_id cannot contain spaces: " + stableId); - } - - if (stableId == null || stableId.length() == 0) { - throw new IllegalArgumentException("stable_id is not specified."); - } - - String cancerStudyIdentifier = properties.getProperty("cancer_study_identifier"); - if (cancerStudyIdentifier == null) { - throw new IllegalArgumentException("cancer_study_identifier is not specified."); - } SpringUtil.initDataSource(); - CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); + CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(caseList.getCancerStudyIdentifier()); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" - + cancerStudyIdentifier + "' not found in dbms or inaccessible to user."); + + caseList.getCancerStudyIdentifier() + "' not found in dbms or inaccessible to user."); } - String sampleListName = properties.getProperty("case_list_name"); - - String sampleListCategoryStr = properties.getProperty("case_list_category"); + String sampleListCategoryStr = caseList.getCategory(); if (sampleListCategoryStr == null || sampleListCategoryStr.length() == 0) { sampleListCategoryStr = "other"; } SampleListCategory sampleListCategory = SampleListCategory.get(sampleListCategoryStr); - String sampleListDescription = properties.getProperty("case_list_description"); - String sampleListStr = properties.getProperty("case_list_ids"); - if (sampleListName == null) { - throw new IllegalArgumentException("case_list_name is not specified."); - } else if (sampleListDescription == null) { - throw new IllegalArgumentException("case_list_description is not specified."); - } - boolean itemsAddedViaPatientLink = false; // construct sample id list ArrayList sampleIDsList = new ArrayList(); - String[] 
sampleIds = sampleListStr.split("\t"); + List sampleIds = caseList.getSampleIds(); for (String sampleId : sampleIds) { sampleId = StableIdUtil.getSampleId(sampleId); Sample s = DaoSample.getSampleByCancerStudyAndSampleId(theCancerStudy.getInternalId(), sampleId); @@ -110,31 +87,31 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept } else if (!sampleIDsList.contains(s.getStableId())) { sampleIDsList.add(s.getStableId()); } else { - ProgressMonitor.logWarning("Warning: duplicated sample ID "+s.getStableId()+" in case list "+stableId); + ProgressMonitor.logWarning("Warning: duplicated sample ID "+s.getStableId()+" in case list "+caseList.getStableId()); } } DaoSampleList daoSampleList = new DaoSampleList(); - SampleList sampleList = daoSampleList.getSampleListByStableId(stableId); + SampleList sampleList = daoSampleList.getSampleListByStableId(caseList.getStableId()); if (sampleList != null) { - throw new IllegalArgumentException("Patient list with this stable Id already exists: " + stableId); + throw new IllegalArgumentException("Patient list with this stable Id already exists: " + caseList.getStableId()); } sampleList = new SampleList(); - sampleList.setStableId(stableId); + sampleList.setStableId(caseList.getStableId()); int cancerStudyId = theCancerStudy.getInternalId(); sampleList.setCancerStudyId(cancerStudyId); sampleList.setSampleListCategory(sampleListCategory); - sampleList.setName(sampleListName); - sampleList.setDescription(sampleListDescription); + sampleList.setName(caseList.getName()); + sampleList.setDescription(caseList.getDescription()); sampleList.setSampleList(sampleIDsList); daoSampleList.addSampleList(sampleList); - sampleList = daoSampleList.getSampleListByStableId(stableId); + sampleList = daoSampleList.getSampleListByStableId(caseList.getStableId()); ProgressMonitor.setCurrentMessage(" --> stable ID: " + sampleList.getStableId()); ProgressMonitor.setCurrentMessage(" --> sample list name: " + 
sampleList.getName()); - ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + sampleIds.length); + ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + sampleIds.size()); String warningSamplesViaPatientLink = (itemsAddedViaPatientLink? "(nb: can be higher if samples were added via patient link)" : ""); ProgressMonitor.setCurrentMessage(" --> number of samples stored in final sample list " + warningSamplesViaPatientLink + ": " + sampleIDsList.size()); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 70c0bc5c..2c581ffc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -26,13 +26,15 @@ import org.mskcc.cbio.portal.dao.DaoSampleList; import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.util.CaseList; +import org.mskcc.cbio.portal.util.CaseListReader; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.validate.CaseListValidator; import java.io.*; import java.util.*; import java.util.stream.Collectors; -//TODO Can we unify this class with ImportSampleList? 
public class UpdateCaseListsSampleIds extends ConsoleRunnable { private File metaFile; @@ -61,45 +63,25 @@ public void run() { updateCaseLists(this.caseListSampleIdToSampleIds); } - // TODO Can we reuse this logic in ImportSampleList.importSampleList(File dataFile) as well private Map> readCaseListFiles() { LinkedHashMap> result = new LinkedHashMap<>(); for (File caseListFile: this.caseListFiles) { - Properties properties = new TrimmedProperties(); - try { - properties.load(new FileReader(caseListFile)); - } catch (IOException e) { - throw new RuntimeException(e); - } - String studyId = properties.getProperty("cancer_study_identifier"); - if (studyId == null || studyId.trim().equals("")) { - throw new RuntimeException(caseListFile.getAbsolutePath() + ": No cancer_study_identifier specified."); - } - if (!studyId.equals(this.cancerStudyStableId)) { + CaseList caseList = CaseListReader.readFile(caseListFile); + CaseListValidator.validateIdFields(caseList); + String cancerStudyIdentifier = caseList.getCancerStudyIdentifier(); + if (!cancerStudyIdentifier.equals(this.cancerStudyStableId)) { ProgressMonitor.logWarning( String.format( "Skipping %s case list file as it belongs to %s study and we uploading %s study.", - caseListFile, studyId, this.cancerStudyStableId)); + caseListFile, cancerStudyIdentifier, this.cancerStudyStableId)); continue; } - String caseListStableId = properties.getProperty("stable_id"); - if (caseListStableId == null || caseListStableId.trim().equals("")) { - throw new RuntimeException(caseListFile.getAbsolutePath() + ": No stable_id specified."); - } - String caseListSampleIds = properties.getProperty("case_list_ids"); - if (caseListSampleIds == null || caseListSampleIds.trim().equals("")) { - throw new RuntimeException(caseListFile.getAbsolutePath() + ": No case_list_ids specified."); - } - Set sampleIds = Arrays.stream(caseListSampleIds.split("\t")).map(sampleId -> sampleId.trim()).filter(sampleId -> 
!"".equals(sampleId.trim())).collect(Collectors.toSet()); - if (sampleIds.isEmpty()) { - throw new RuntimeException(caseListFile.getAbsolutePath() + ": No sample ids specified."); - } - LinkedHashSet extraSampleIds = new LinkedHashSet<>(sampleIds); + LinkedHashSet extraSampleIds = new LinkedHashSet<>(caseList.getSampleIds()); extraSampleIds.removeAll(this.allSampleIds); if (!extraSampleIds.isEmpty()) { throw new RuntimeException(caseListFile.getAbsolutePath() + ": The following sample ids present in the case list file, but not specified in the clinical sample file: " + String.join(", ", extraSampleIds)); } - result.put(caseListStableId, sampleIds); + result.put(caseList.getStableId(), new LinkedHashSet<>(caseList.getSampleIds())); } return result; } diff --git a/src/main/java/org/mskcc/cbio/portal/util/CaseList.java b/src/main/java/org/mskcc/cbio/portal/util/CaseList.java new file mode 100644 index 00000000..5e01c984 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/CaseList.java @@ -0,0 +1,48 @@ +package org.mskcc.cbio.portal.util; + +import java.util.List; + +public class CaseList { + + private final String stableId; + private final String cancerStudyIdentifier; + private final String name; + private final String description; + + private final String category; + private final List sampleIds; + + CaseList(String stableId, String cancerStudyIdentifier, String name, String description, String category, List sampleIds) { + this.stableId = stableId; + this.cancerStudyIdentifier = cancerStudyIdentifier; + this.name = name; + this.description = description; + this.category = category; + this.sampleIds = sampleIds; + } + + public String getStableId() { + return stableId; + } + + public String getCancerStudyIdentifier() { + return cancerStudyIdentifier; + } + + public String getName() { + return name; + } + + public String getCategory() { + return category; + } + + public String getDescription() { + return description; + } + + public List getSampleIds() { + 
return sampleIds; + } + +} diff --git a/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java b/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java new file mode 100644 index 00000000..02c15763 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/CaseListReader.java @@ -0,0 +1,43 @@ +package org.mskcc.cbio.portal.util; + +import org.mskcc.cbio.portal.scripts.TrimmedProperties; + +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.stream.Collectors; + +public class CaseListReader { + + public static CaseList readFile(File caseListFile) { + Properties properties = new TrimmedProperties(); + try { + properties.load(new FileReader(caseListFile)); + } catch (IOException e) { + throw new RuntimeException(e); + } + + String stableId = properties.getProperty("stable_id"); + String cancerStudyIdentifier = properties.getProperty("cancer_study_identifier"); + String caseListName = properties.getProperty("case_list_name"); + String caseListDescription = properties.getProperty("case_list_description"); + String caseListCategory = properties.getProperty("case_list_category"); + String caseListIds = properties.getProperty("case_list_ids"); + List sampleIds = caseListIds == null ? 
List.of() + : Arrays.stream(caseListIds.split("\t")).toList(); + + return new CaseList( + stableId, + cancerStudyIdentifier, + caseListName, + caseListDescription, + caseListCategory, + sampleIds + ); + } + + +} diff --git a/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java b/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java new file mode 100644 index 00000000..d6a2494e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/validate/CaseListValidator.java @@ -0,0 +1,48 @@ +package org.mskcc.cbio.portal.validate; + +import org.mskcc.cbio.portal.util.CaseList; + +public class CaseListValidator { + + /** + * Fields that are used during case list update + * @param caseList + */ + public static void validateIdFields(CaseList caseList) { + if (caseList.getStableId() == null) { + throw new IllegalArgumentException("stable id is not specified."); + } + if (caseList.getStableId().matches(".*\\s.*")) { + throw new IllegalArgumentException(String.format("stable id cannot contain white space(s): '%s'", caseList.getStableId())); + } + if (caseList.getCancerStudyIdentifier() == null) { + throw new IllegalArgumentException("cancer study identifier is not specified."); + } + if (caseList.getCancerStudyIdentifier().matches(".*\\s.*")) { + throw new IllegalArgumentException(String.format("cancer study identifier cannot contain white space(s): '%s'", caseList.getStableId())); + } + if (caseList.getSampleIds() == null || caseList.getSampleIds().isEmpty()) { + throw new IllegalArgumentException("sample ids are not specified."); + } + } + + /** + * Fields that are used during case list creation + * @param caseList + */ + public static void validateDescriptionFields(CaseList caseList) { + if (caseList.getName() == null) { + throw new IllegalArgumentException("case list name is not specified."); + } + if (caseList.getDescription() == null) { + throw new IllegalArgumentException("case list description is not specified."); + } + } + + public static void 
validateAll(CaseList caseList) { + validateIdFields(caseList); + validateDescriptionFields(caseList); + } + + +} From 1829842aa8a7c20d2b9a6ddca532f4d1f291f889 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 30 Apr 2024 16:01:46 +0200 Subject: [PATCH 034/130] Fix TestIntegrationTest - change location of the files - make sure assertions could work on the seed mini db - get rid from absent cbioportal dependencies --- .../scripts/TestIntegrationTest.java | 124 ++++++++++-------- src/test/resources/integrationTestScript.xml | 64 ++++----- 2 files changed, 98 insertions(+), 90 deletions(-) diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java index bc3bbac1..dc76865f 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestIntegrationTest.java @@ -28,16 +28,8 @@ import com.fasterxml.jackson.core.JsonParseException; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; -import org.cbioportal.model.GenericAssayData; -import org.cbioportal.model.GenesetMolecularData; -import org.cbioportal.model.StructuralVariant; -import org.cbioportal.model.StructuralVariantQuery; -import org.cbioportal.persistence.PersistenceConstants; -import org.cbioportal.service.GenericAssayService; -import org.cbioportal.service.GenesetDataService; -import org.cbioportal.service.StructuralVariantService; +import org.cbioportal.model.GeneticEntity; import org.junit.Before; -import org.junit.Ignore; import org.junit.Test; import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.DaoCancerStudy; @@ -51,7 +43,9 @@ import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoGistic; import org.mskcc.cbio.portal.dao.DaoMutation; +import 
org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.dao.DaoStructuralVariant; import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CancerStudy; @@ -59,9 +53,12 @@ import org.mskcc.cbio.portal.model.ClinicalAttribute; import org.mskcc.cbio.portal.model.ClinicalData; import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.Geneset; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Gistic; +import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.model.StructuralVariant; import org.mskcc.cbio.portal.model.TypeOfCancer; import org.mskcc.cbio.portal.scripts.ImportGenePanel; import org.mskcc.cbio.portal.util.ConsoleUtil; @@ -81,12 +78,12 @@ import java.io.InputStream; import java.util.ArrayList; import java.util.Arrays; -import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -129,17 +126,15 @@ public void setUp() throws DaoException, JsonParseException, JsonMappingExceptio * * @throws Throwable */ - @Ignore("Skip TestIntegrationTest.testLoadStudyEs0 due to NullPointerException") @Test public void testLoadStudyEs0() throws Throwable { try { // === assumptions that we rely upon in the checks later on: ==== - // assumption 1: there are no clinical attributes at the start of the test: - assertEquals(0, DaoClinicalAttributeMeta.getAllMap().size()); - // use this to get progress info/troubleshoot: // ProgressMonitor.setConsoleMode(true); + int numberOfMutationsInDb = DaoMutation.getAllMutations().size(); + // ==== Load the data ==== TransactionalScripts scripts = 
applicationContext.getBean(TransactionalScripts.class); scripts.run(); @@ -167,29 +162,25 @@ public void testLoadStudyEs0() throws Throwable { List mutations = DaoMutation.getAllMutations(); // check number of mutation records in the database // 3 in seed_mini.sql + 33 study_es_0/data_mutations_extended.maf (2 silent ignored)) - // so we expect 34 records in DB: - assertEquals(34, mutations.size()); + // so we expect +34 records in DB: + assertEquals(numberOfMutationsInDb + 34, mutations.size()); //===== Check STRUCTURAL VARIANT data ======== - // 45 structural variant events are imported, using 31 unique genes, using 39 samples - // Not all 31 genes have to be queried. BRAF is fused to many of the test genes. - List entrezGeneIds = new ArrayList(Arrays.asList(57670, 673, 8031, 5979, 27436, 238, 7113, 2078, 1956, 238, 5774, 2115, 7273)); - // Add samples and molecular profile IDs - List sampleIds = new ArrayList(Arrays.asList("TCGA-A2-A04P-01", "TCGA-A1-A0SB-01", "TCGA-A1-A0SB-01", "TCGA-A2-A04P-01", "TCGA-A2-A04P-01", "TCGA-A1-A0SK-01", "TCGA-A2-A0CM-01", "TCGA-AR-A1AR-01", "TCGA-B6-A0WX-01", "TCGA-BH-A1F0-01", "TCGA-B6-A0I6-01", "TCGA-BH-A18V-01", "TCGA-BH-A18Q-01", "TCGA-BH-A18K-01", "TCGA-BH-A0HL-01", "TCGA-BH-A0E0-01", "TCGA-BH-A0RX-01", "TCGA-A7-A13D-01", "TCGA-BH-A0E6-01", "TCGA-AO-A0J4-01", "TCGA-A7-A0CE-01", "TCGA-A7-A13E-01", "TCGA-A7-A0DA-01", "TCGA-D8-A142-01", "TCGA-D8-A143-01", "TCGA-AQ-A04J-01", "TCGA-BH-A0HN-01", "TCGA-A2-A0T0-01", "TCGA-A2-A0YE-01", "TCGA-A2-A0YJ-01", "TCGA-A2-A0D0-01", "TCGA-A2-A04U-01", "TCGA-AO-A0J6-01", "TCGA-A2-A0YM-01", "TCGA-A2-A0D2-01", "TCGA-BH-A0B3-01", "TCGA-A2-A04Q-01", "TCGA-A2-A0SX-01", "TCGA-AO-A0JL-01")); - List geneticProfileStableIds = new ArrayList(); - geneticProfileStableIds = Collections.nCopies(sampleIds.size(), "study_es_0_structural_variants"); + GeneticProfile svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_structural_variants"); - StructuralVariantService 
structuralVariantService = applicationContext.getBean(StructuralVariantService.class); - List noStructVars = Collections.emptyList(); - List structuralVariants = structuralVariantService.fetchStructuralVariants(geneticProfileStableIds, sampleIds, entrezGeneIds, noStructVars); + List structuralVariants = DaoStructuralVariant.getAllStructuralVariants() + .stream() + .filter(sv -> + sv.getGeneticProfileId() == svGeneticProfile.getGeneticProfileId() + ) + .collect(Collectors.toList()); - // Check if all 45 structural variants are imported - assertEquals(45, structuralVariants.size()); + // Check if all 48 structural variants are imported + assertEquals(48, structuralVariants.size()); //===== Check CNA data ======== DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - ArrayList hugoGeneSymbols = new ArrayList(Arrays.asList("ACAP3","AGRN","ATAD3A","ATAD3B","ATAD3C","AURKAIP1","ERCC5")); ArrayList entrezIds = new ArrayList(Arrays.asList(116983L, 375790L, 55210L, 83858L, 219293L, 54998L, 2073L)); GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_gistic"); int countAMP_DEL = 0; @@ -286,7 +277,7 @@ public void testLoadStudyEs0() throws Throwable { // ===== check gistic data // servlet uses this query: ArrayList gistics = DaoGistic.getAllGisticByCancerStudyId(cancerStudy.getInternalId()); - assertEquals(12, gistics.size()); + assertEquals(11, gistics.size()); Gistic gisticChr10 = null, gisticChr20 = null; for (Gistic gistic : gistics) { if (gistic.getChromosome() == 20) { @@ -340,53 +331,70 @@ public void testLoadStudyEs0() throws Throwable { // ===== check mutational signature String testMutationalSignatureStableIds = "mean_1"; + GeneticEntity mutationSignatureGeneticEntity = DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds); + assertNotNull(mutationSignatureGeneticEntity); + String testMutationalSignatureMolecularProfileIds = "study_es_0_mutational_signature"; - 
assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testMutationalSignatureStableIds)); + GeneticProfile mutationSignatureProfile = DaoGeneticProfile.getGeneticProfileByStableId(testMutationalSignatureMolecularProfileIds); + assertNotNull(mutationSignatureProfile); // ENTITY_STABLE_ID name description TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 // TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 // TCGA-BH-A0HP-01 TCGA-BH-A18P-01 // mean_1 ... ... ... 0.370266873 0.010373016 0.005419294 0.022753384 0.037687823 0.016708976 0.100042446 0.104214723 - GenericAssayService genericAssayService = applicationContext.getBean(GenericAssayService.class); - List mutationalSignatureData = genericAssayService.fetchGenericAssayData(Arrays.asList(testMutationalSignatureMolecularProfileIds), - Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testMutationalSignatureStableIds), PersistenceConstants.SUMMARY_PROJECTION); - assertEquals(2, mutationalSignatureData.size()); - assertEquals("0.370266873", mutationalSignatureData.get(0).getValue()); - assertEquals("0.022753384", mutationalSignatureData.get(1).getValue()); + HashMap mutationalSignatureData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + mutationSignatureProfile.getGeneticProfileId(), + List.of(mutationSignatureGeneticEntity.getId())).get(mutationSignatureGeneticEntity.getId()); + Sample sbSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SB-01"); + Sample shSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SH-01"); + assertEquals("0.370266873", mutationalSignatureData.get(sbSample.getInternalId())); + assertEquals("0.022753384", mutationalSignatureData.get(shSample.getInternalId())); // ===== check GSVA data // ... 
- String testGeneset = "GO_ATP_DEPENDENT_CHROMATIN_REMODELING"; - assertEquals(4, DaoGeneset.getGenesetByExternalId(testGeneset).getGenesetGeneIds().size()); + String testGenesetExternalId = "GO_ATP_DEPENDENT_CHROMATIN_REMODELING"; + Geneset testGeneset = DaoGeneset.getGenesetByExternalId(testGenesetExternalId); + assertEquals(4, testGeneset.getGenesetGeneIds().size()); // scores: TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 // TCGA-A2-A04U-01 // GO_ATP_DEPENDENT_CHROMATIN_REMODELING -0.293861251463613 -0.226227563676626 // -0.546556962547473 -0.0811115513543749 0.56919171543422 // using new api: - GenesetDataService genesetDataService = applicationContext.getBean(GenesetDataService.class); - List genesetData = genesetDataService.fetchGenesetData("study_es_0_gsva_scores", - "study_es_0_all", Arrays.asList(testGeneset)); + GeneticProfile gsvaScoresProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_gsva_scores"); + HashMap genesetData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + gsvaScoresProfile.getGeneticProfileId(), + List.of(testGeneset.getGeneticEntityId())).get(testGeneset.getGeneticEntityId()); assertEquals(5, genesetData.size()); - genesetData = genesetDataService.fetchGenesetData("study_es_0_gsva_scores", - Arrays.asList("TCGA-A1-A0SB-01", "TCGA-A1-A0SH-01"), Arrays.asList(testGeneset)); - assertEquals(2, genesetData.size()); - assertEquals(-0.293861251463613, Double.parseDouble(genesetData.get(0).getValue()), 0.00001); - assertEquals(-0.0811115513543749, Double.parseDouble(genesetData.get(1).getValue()), 0.00001); + String sbSampleGenesetValueString = genesetData.get(sbSample.getInternalId()); + String shSampleGenesetValuesString = genesetData.get(shSample.getInternalId()); + assertEquals(-0.293861251463613, Double.parseDouble(sbSampleGenesetValueString), 0.00001); + assertEquals(-0.0811115513543749, Double.parseDouble(shSampleGenesetValuesString), 0.00001); // ===== check treatment 
(profile) data // ... - String testTreatment = "Irinotecan"; - assertNotNull(DaoGeneticEntity.getGeneticEntityByStableId(testTreatment)); + String testTreatmentStableId = "Irinotecan"; + GeneticEntity testTreatmentGeneticEntity = DaoGeneticEntity.getGeneticEntityByStableId(testTreatmentStableId); + assertNotNull(testTreatmentGeneticEntity); // ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 // TCGA-A1-A0SE-01 TCGA-A1-A0SH-01 TCGA-A2-A04U-01 TCGA-B6-A0RS-01 // TCGA-BH-A0HP-01 TCGA-BH-A18P-01 // Irinotecan ... ... ... NA 0.080764666 NA 0.06704437 0.069568723 0.034992039 // 0.740817904 0.209220141 - GenericAssayService treatmentDataService = applicationContext.getBean(GenericAssayService.class); - List treatmentData = treatmentDataService.getGenericAssayData("study_es_0_treatment_ic50", "study_es_0_all", Arrays.asList(testTreatment), PersistenceConstants.SUMMARY_PROJECTION); + GeneticProfile treatmentIc50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_es_0_treatment_ic50"); + HashMap treatmentData = DaoGeneticAlteration + .getInstance() + .getGeneticAlterationMapForEntityIds( + treatmentIc50Profile.getGeneticProfileId(), + List.of(testTreatmentGeneticEntity.getId())).get(testTreatmentGeneticEntity.getId()); assertEquals(8, treatmentData.size()); - assertEquals("NA", treatmentData.get(0).getValue()); - assertEquals(0.080764666, Double.parseDouble(treatmentData.get(1).getValue()), 0.00001); + String sbSampleIrinotecanTraetmentValuesString = treatmentData.get(sbSample.getInternalId()); + assertEquals("NA", sbSampleIrinotecanTraetmentValuesString); + String shSampleIrinotecanTraetmentValuesString = treatmentData.get(shSample.getInternalId()); + assertEquals(0.06704437, Double.parseDouble(shSampleIrinotecanTraetmentValuesString), 0.00001); // ===== check study status assertEquals(DaoCancerStudy.Status.AVAILABLE, DaoCancerStudy.getStatus("study_es_0")); @@ -419,7 +427,7 @@ private void loadGenes() throws DaoException, JsonParseException, 
JsonMappingExc Map> aliasesMap = new HashMap>(); InputStream inputStream = new FileInputStream( - "src/test/scripts/test_data/api_json_system_tests/genesaliases.json"); + "tests/test_data/api_json_system_tests/genesaliases.json"); // parse json file: ObjectMapper mapper = new ObjectMapper(); TestGeneAlias[] genesAliases = mapper.readValue(inputStream, TestGeneAlias[].class); @@ -434,7 +442,7 @@ private void loadGenes() throws DaoException, JsonParseException, JsonMappingExc aliases.add(testGeneAlias.geneAlias); } - inputStream = new FileInputStream("src/test/scripts/test_data/api_json_system_tests/genes.json"); + inputStream = new FileInputStream("tests/test_data/api_json_system_tests/genes.json"); // parse json file: mapper = new ObjectMapper(); TestGene[] genes = mapper.readValue(inputStream, TestGene[].class); @@ -457,9 +465,9 @@ private void loadGenes() throws DaoException, JsonParseException, JsonMappingExc */ private void loadGenePanel() throws Exception { ImportGenePanel gp = new ImportGenePanel(null); - gp.setFile(new File("src/test/scripts/test_data/study_es_0/data_gene_panel_testpanel1.txt")); + gp.setFile(new File("tests/test_data/study_es_0/data_gene_panel_testpanel1.txt")); gp.importData(); - gp.setFile(new File("src/test/scripts/test_data/study_es_0/data_gene_panel_testpanel2.txt")); + gp.setFile(new File("tests/test_data/study_es_0/data_gene_panel_testpanel2.txt")); gp.importData(); } @@ -473,7 +481,7 @@ static class TestGene { @JsonIgnoreProperties(ignoreUnknown = true) static class TestGeneAlias { - @JsonProperty("gene_alias") + @JsonProperty("alias") String geneAlias; @JsonProperty("entrezGeneId") int entrezGeneId; diff --git a/src/test/resources/integrationTestScript.xml b/src/test/resources/integrationTestScript.xml index 395272de..87ed6e8f 100644 --- a/src/test/resources/integrationTestScript.xml +++ b/src/test/resources/integrationTestScript.xml @@ -25,7 +25,7 @@ org.mskcc.cbio.portal.scripts.ImportTypesOfCancers - 
src/test/scripts/test_data/study_es_0/data_cancer_type.txt + tests/test_data/study_es_0/data_cancer_type.txt false --noprogress @@ -47,25 +47,25 @@ org.mskcc.cbio.portal.scripts.ImportCancerStudy - src/test/scripts/test_data/study_es_0/meta_study.txt + tests/test_data/study_es_0/meta_study.txt org.mskcc.cbio.portal.scripts.ImportClinicalData --data - src/test/scripts/test_data/study_es_0/data_clinical_samples.txt + tests/test_data/study_es_0/data_clinical_samples.txt --meta - src/test/scripts/test_data/study_es_0/meta_clinical_samples.txt + tests/test_data/study_es_0/meta_clinical_samples.txt org.mskcc.cbio.portal.scripts.ImportClinicalData --data - src/test/scripts/test_data/study_es_0/data_clinical_patients.txt + tests/test_data/study_es_0/data_clinical_patients.txt --meta - src/test/scripts/test_data/study_es_0/meta_clinical_patients.txt + tests/test_data/study_es_0/meta_clinical_patients.txt @@ -74,9 +74,9 @@ org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData --data - src/test/scripts/test_data/study_es_0/data_cna_hg19.seg + tests/test_data/study_es_0/data_cna_hg19.seg --meta - src/test/scripts/test_data/study_es_0/meta_cna_hg19_seg.txt + tests/test_data/study_es_0/meta_cna_hg19_seg.txt --loadMode bulkload --noprogress @@ -86,9 +86,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_mutations_extended.maf + tests/test_data/study_es_0/data_mutations_extended.maf --meta - src/test/scripts/test_data/study_es_0/meta_mutations_extended.txt + tests/test_data/study_es_0/meta_mutations_extended.txt --loadMode bulkload --noprogress @@ -98,9 +98,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_cna_discrete.txt + tests/test_data/study_es_0/data_cna_discrete.txt --meta - src/test/scripts/test_data/study_es_0/meta_cna_discrete.txt + tests/test_data/study_es_0/meta_cna_discrete.txt --loadMode bulkload --noprogress @@ -110,9 +110,9 @@ 
org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_expression_median.txt + tests/test_data/study_es_0/data_expression_median.txt --meta - src/test/scripts/test_data/study_es_0/meta_expression_median.txt + tests/test_data/study_es_0/meta_expression_median.txt --loadMode bulkload --noprogress @@ -122,9 +122,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_structural_variants.txt + tests/test_data/study_es_0/data_structural_variants.txt --meta - src/test/scripts/test_data/study_es_0/meta_structural_variants.txt + tests/test_data/study_es_0/meta_structural_variants.txt --loadMode bulkload --noprogress @@ -134,7 +134,7 @@ org.mskcc.cbio.portal.scripts.ImportGisticData --data - src/test/scripts/test_data/study_es_0/data_gistic_genes_amp.txt + tests/test_data/study_es_0/data_gistic_genes_amp.txt --study study_es_0 --noprogress @@ -144,9 +144,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_cna_log2.txt + tests/test_data/study_es_0/data_cna_log2.txt --meta - src/test/scripts/test_data/study_es_0/meta_cna_log2.txt + tests/test_data/study_es_0/meta_cna_log2.txt --loadMode bulkload --noprogress @@ -156,9 +156,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_methylation_hm27.txt + tests/test_data/study_es_0/data_methylation_hm27.txt --meta - src/test/scripts/test_data/study_es_0/meta_methylation_hm27.txt + tests/test_data/study_es_0/meta_methylation_hm27.txt --loadMode bulkload --noprogress @@ -168,9 +168,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_gsva_scores.txt + tests/test_data/study_es_0/data_gsva_scores.txt --meta - src/test/scripts/test_data/study_es_0/meta_gsva_scores.txt + tests/test_data/study_es_0/meta_gsva_scores.txt --loadMode bulkload --noprogress @@ -178,9 +178,9 @@ 
org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_gsva_pvalues.txt + tests/test_data/study_es_0/data_gsva_pvalues.txt --meta - src/test/scripts/test_data/study_es_0/meta_gsva_pvalues.txt + tests/test_data/study_es_0/meta_gsva_pvalues.txt --loadMode bulkload --noprogress @@ -190,9 +190,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_treatment_ic50.txt + tests/test_data/study_es_0/data_treatment_ic50.txt --meta - src/test/scripts/test_data/study_es_0/meta_treatment_ic50.txt + tests/test_data/study_es_0/meta_treatment_ic50.txt --loadMode bulkload --noprogress @@ -200,9 +200,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_treatment_ec50.txt + tests/test_data/study_es_0/data_treatment_ec50.txt --meta - src/test/scripts/test_data/study_es_0/meta_treatment_ec50.txt + tests/test_data/study_es_0/meta_treatment_ec50.txt --loadMode bulkload --noprogress @@ -212,9 +212,9 @@ org.mskcc.cbio.portal.scripts.ImportProfileData --data - src/test/scripts/test_data/study_es_0/data_mutational_signature.txt + tests/test_data/study_es_0/data_mutational_signature.txt --meta - src/test/scripts/test_data/study_es_0/meta_mutational_signature.txt + tests/test_data/study_es_0/meta_mutational_signature.txt --loadMode bulkload --noprogress @@ -223,7 +223,7 @@ org.mskcc.cbio.portal.scripts.ImportSampleList - src/test/scripts/test_data/study_es_0/case_lists/cases_custom.txt + tests/test_data/study_es_0/case_lists/cases_custom.txt From e785a536e799e072c76c30757b878439756c2b40 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 30 Apr 2024 16:12:51 +0200 Subject: [PATCH 035/130] Revert RESOURCE_DEFINITION_DICTIONARY initialsation to empty set --- scripts/importer/validateData.py | 35 +++++++++----------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/scripts/importer/validateData.py 
b/scripts/importer/validateData.py index aebb3ea1..b5f3f3f6 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -75,7 +75,7 @@ sample_ids_panel_dict = {} # resource globals -RESOURCE_DEFINITION_DICTIONARY = None +RESOURCE_DEFINITION_DICTIONARY = {} RESOURCE_PATIENTS_WITH_SAMPLES = None # globals required for gene set scoring validation @@ -3385,7 +3385,7 @@ def checkLine(self, data): sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1] # Sample ID has been removed from list, so subtract 1 position. if data[self.mutation_stable_id_index - 1] != 'NA': - if mutation_sample_ids is not None and sample_id not in mutation_sample_ids: + if sample_id not in mutation_sample_ids: self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list', extra={'line_number': self.line_number, 'cause': sample_id}) @@ -3790,8 +3790,7 @@ def checkLine(self, data): 'column_number': col_index + 1, 'cause': value}) # make sure that RESOURCE_ID is defined in the resource definition file - if RESOURCE_DEFINITION_DICTIONARY is not None \ - and value not in RESOURCE_DEFINITION_DICTIONARY: + if value not in RESOURCE_DEFINITION_DICTIONARY: self.logger.error( 'RESOURCE_ID is not defined in resource definition file', extra={'line_number': self.line_number, @@ -3858,17 +3857,13 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if RESOURCE_DEFINITION_DICTIONARY is not None \ - and (value not in RESOURCE_DEFINITION_DICTIONARY \ - or 'SAMPLE' not in RESOURCE_DEFINITION_DICTIONARY[value]): + if value not in RESOURCE_DEFINITION_DICTIONARY or 'SAMPLE' not in RESOURCE_DEFINITION_DICTIONARY[value]: self.logger.error( 'RESOURCE_ID for sample resource is not defined correctly in resource definition file', extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if RESOURCE_DEFINITION_DICTIONARY is not 
None \ - and value in RESOURCE_DEFINITION_DICTIONARY \ - and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for sample resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -3923,17 +3918,13 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if RESOURCE_DEFINITION_DICTIONARY is not None \ - and (value not in RESOURCE_DEFINITION_DICTIONARY \ - or 'PATIENT' not in RESOURCE_DEFINITION_DICTIONARY[value]): + if value not in RESOURCE_DEFINITION_DICTIONARY or 'PATIENT' not in RESOURCE_DEFINITION_DICTIONARY[value]: self.logger.error( 'RESOURCE_ID for patient resource is not defined correctly in resource definition file', extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if RESOURCE_DEFINITION_DICTIONARY is not None \ - and value in RESOURCE_DEFINITION_DICTIONARY \ - and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for patient resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -3977,17 +3968,13 @@ def checkLine(self, data): value = data[col_index].strip() # make sure RESOURCE_ID is defined correctly if col_name == 'RESOURCE_ID': - if RESOURCE_DEFINITION_DICTIONARY is not None \ - and (value not in RESOURCE_DEFINITION_DICTIONARY \ - or 'STUDY' not in RESOURCE_DEFINITION_DICTIONARY[value]): + if value not in RESOURCE_DEFINITION_DICTIONARY or 'STUDY' not in RESOURCE_DEFINITION_DICTIONARY[value]: self.logger.error( 'RESOURCE_ID for study resource is not defined correctly in resource definition file', extra={'line_number': self.line_number, 'column_number': col_index + 1, 'cause': value}) - if 
RESOURCE_DEFINITION_DICTIONARY is not None \ - and value in RESOURCE_DEFINITION_DICTIONARY \ - and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: + if value in RESOURCE_DEFINITION_DICTIONARY and len(RESOURCE_DEFINITION_DICTIONARY[value]) > 1: self.logger.warning( 'RESOURCE_ID for study resource has been used by more than one RESOURCE_TYPE', extra={'line_number': self.line_number, @@ -5570,9 +5557,7 @@ def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_ma continue logger.info("Validating %s", meta_file_type) for validator in validators: - # TODO skip None's. Why do we even have them? - if validator: - validator.validate() + validator.validate() def get_pom_path(): From e09e1e25325b337aa80fff24436284a1d49d059f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 30 Apr 2024 17:21:14 +0200 Subject: [PATCH 036/130] Minor improvments. Apply PRs feedback --- scripts/importer/validateData.py | 1 + .../mskcc/cbio/portal/dao/DaoMutation.java | 7 +- .../mskcc/cbio/portal/dao/DaoSampleList.java | 8 +- .../portal/scripts/ImportClinicalData.java | 15 ++- .../cbio/portal/scripts/ImportSampleList.java | 7 +- .../scripts/UpdateCaseListsSampleIds.java | 109 +++++++++--------- 6 files changed, 73 insertions(+), 74 deletions(-) diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index b5f3f3f6..774597ad 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -70,6 +70,7 @@ DEFINED_SAMPLE_IDS = None DEFINED_SAMPLE_ATTRIBUTES = None PATIENTS_WITH_SAMPLES = None +DEFINED_CANCER_TYPES = None mutation_sample_ids = None mutation_file_sample_ids = set() sample_ids_panel_dict = {} diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index 3a7472c0..b8d1e4f9 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -67,6 +67,8 @@ public final 
class DaoMutation { public static final String NAN = "NaN"; private static final String MUTATION_COUNT_ATTR_ID = "MUTATION_COUNT"; + public static final String DELETE_ALTERATION_DRIVER_ANNOTATION = "DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; + public static final String DELETE_MUTATION = "DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; public static int addMutation(ExtendedMutation mutation, boolean newMutationEvent) throws DaoException { if (!MySQLbulkLoader.isBulkLoad()) { @@ -1499,13 +1501,12 @@ public static void deleteAllRecordsInGeneticProfileForSample(long geneticProfile ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoMutation.class); - // TODO Move it to another class? - pstmt = con.prepareStatement("DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"); + pstmt = con.prepareStatement(DELETE_ALTERATION_DRIVER_ANNOTATION); pstmt.setLong(1, geneticProfileId); pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); - pstmt = con.prepareStatement("DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"); + pstmt = con.prepareStatement(DELETE_MUTATION); pstmt.setLong(1, geneticProfileId); pstmt.setLong(2, internalSampleId); pstmt.executeUpdate(); diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index 72d1c1b2..cd5fef98 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -42,7 +42,9 @@ */ public class DaoSampleList { - /** + public static final String DELETE_SAMPLE_LIST_LIST = "DELETE FROM sample_list_list WHERE `LIST_ID` = ?"; + + /** * Adds record to sample_list table. 
*/ public int addSampleList(SampleList sampleList) throws DaoException { @@ -67,7 +69,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { int listListRow = addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); rows = (listListRow != -1) ? (rows + listListRow) : rows; } else { - throw new SQLException("Creating sample list failed, no ID obtained."); + throw new DaoException("Creating sample list failed, no ID obtained."); } } } catch (SQLException e) { @@ -253,7 +255,7 @@ public void updateSampleListList(SampleList sampleList) throws DaoException { PreparedStatement pstmt = null; try { con = JdbcUtil.getDbConnection(DaoSampleList.class); - pstmt = con.prepareStatement("DELETE FROM sample_list_list WHERE `LIST_ID` = ?"); + pstmt = con.prepareStatement(DELETE_SAMPLE_LIST_LIST); pstmt.setInt(1, sampleList.getSampleListId()); pstmt.executeUpdate(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 11519601..0e37c9e0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -43,7 +43,6 @@ import java.util.regex.*; import org.apache.commons.collections4.map.MultiKeyMap; -//TODO Remove MIXED_ATTRIBUTES data type https://github.com/cBioPortal/cbioportal-core/issues/31 public class ImportClinicalData extends ConsoleRunnable { public static final String DELIMITER = "\t"; @@ -104,6 +103,10 @@ public static enum AttributeTypes { PATIENT_ATTRIBUTES("PATIENT"), SAMPLE_ATTRIBUTES("SAMPLE"), + /** + * We want to encourage use patient or sample files instead, not mixed ones. 
+ * See https://github.com/cBioPortal/cbioportal-core/issues/31 + */ @Deprecated MIXED_ATTRIBUTES("MIXED"); @@ -664,17 +667,13 @@ public void run() { attributesDatatype = properties.getProperty("datatype"); cancerStudyStableId = properties.getProperty("cancer_study_identifier"); } - if( options.has ( attributeFlag ) ) - { + if (options.has(attributeFlag)) { attributesDatatype = "MIXED_ATTRIBUTES"; } - if( options.has ( relaxedFlag ) ) - { + if (options.has(relaxedFlag)) { relaxed = true; - } - if( options.has ( overWriteExistingFlag ) ) - { + if (options.has(overWriteExistingFlag)) { overwriteExisting = true; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java index 7e94efb2..356471e7 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java @@ -66,8 +66,7 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept boolean itemsAddedViaPatientLink = false; // construct sample id list ArrayList sampleIDsList = new ArrayList(); - List sampleIds = caseList.getSampleIds(); - for (String sampleId : sampleIds) { + for (String sampleId : caseList.getSampleIds()) { sampleId = StableIdUtil.getSampleId(sampleId); Sample s = DaoSample.getSampleByCancerStudyAndSampleId(theCancerStudy.getInternalId(), sampleId); if (s==null) { @@ -87,7 +86,7 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept } else if (!sampleIDsList.contains(s.getStableId())) { sampleIDsList.add(s.getStableId()); } else { - ProgressMonitor.logWarning("Warning: duplicated sample ID "+s.getStableId()+" in case list "+caseList.getStableId()); + ProgressMonitor.logWarning("Warning: duplicated sample ID " + s.getStableId() + " in case list " + caseList.getStableId()); } } @@ -111,7 +110,7 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept 
ProgressMonitor.setCurrentMessage(" --> stable ID: " + sampleList.getStableId()); ProgressMonitor.setCurrentMessage(" --> sample list name: " + sampleList.getName()); - ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + sampleIds.size()); + ProgressMonitor.setCurrentMessage(" --> number of samples in file: " + caseList.getSampleIds().size()); String warningSamplesViaPatientLink = (itemsAddedViaPatientLink? "(nb: can be higher if samples were added via patient link)" : ""); ProgressMonitor.setCurrentMessage(" --> number of samples stored in final sample list " + warningSamplesViaPatientLink + ": " + sampleIDsList.size()); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index 2c581ffc..b8ab514e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -31,8 +31,19 @@ import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.validate.CaseListValidator; -import java.io.*; -import java.util.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.LinkedHashMap; +import java.util.LinkedHashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; import java.util.stream.Collectors; public class UpdateCaseListsSampleIds extends ConsoleRunnable { @@ -41,8 +52,8 @@ public class UpdateCaseListsSampleIds extends ConsoleRunnable { private File dataFile; private List caseListFiles = List.of(); private String cancerStudyStableId; - private Map> caseListSampleIdToSampleIds = new LinkedHashMap<>(); - private DaoSampleList daoSampleList = new DaoSampleList(); + private final Map> caseListSampleIdToSampleIds = new 
LinkedHashMap<>(); + private final DaoSampleList daoSampleList = new DaoSampleList(); private LinkedHashSet allSampleIds; public UpdateCaseListsSampleIds(String[] args) { @@ -56,7 +67,6 @@ public void run() { parseArguments(); readStudyIdAndDataFileFromMetaFile(); this.allSampleIds = readSampleIdsFromDataFile(this.dataFile); - // TODO has the all case list always to exist? this.caseListSampleIdToSampleIds.put(cancerStudyStableId + "_all", this.allSampleIds); Map> readCaseListSampleIds = readCaseListFiles(); this.caseListSampleIdToSampleIds.putAll(readCaseListSampleIds); @@ -65,7 +75,7 @@ public void run() { private Map> readCaseListFiles() { LinkedHashMap> result = new LinkedHashMap<>(); - for (File caseListFile: this.caseListFiles) { + for (File caseListFile : this.caseListFiles) { CaseList caseList = CaseListReader.readFile(caseListFile); CaseListValidator.validateIdFields(caseList); String cancerStudyIdentifier = caseList.getCancerStudyIdentifier(); @@ -87,34 +97,32 @@ private Map> readCaseListFiles() { } private void updateCaseLists(Map> caseListSampleIdToSampleIds) { - // TODO Do we really have to do this? Is there a better way? 
DaoCancerStudy.reCacheAll(); try { - for (Map.Entry> caseListStableIdToSampleIds: caseListSampleIdToSampleIds.entrySet()) { + for (Map.Entry> caseListStableIdToSampleIds : caseListSampleIdToSampleIds.entrySet()) { String caseListStableId = caseListStableIdToSampleIds.getKey(); - Set sampleIds = caseListStableIdToSampleIds.getValue(); + Set uploadedSampleIds = caseListStableIdToSampleIds.getValue(); SampleList sampleList = daoSampleList.getSampleListByStableId(caseListStableId); if (sampleList == null) { throw new RuntimeException("No case list with " + caseListStableId + " stable id is found"); } - LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(sampleIds); - newCaseListSampleIds.addAll(sampleList.getSampleList()); - ArrayList newSampleArrayList = new ArrayList<>(newCaseListSampleIds); - sampleList.setSampleList(newSampleArrayList); - //TODO no need to run expensive db update if sampleList hasn't effectively changed - daoSampleList.updateSampleListList(sampleList); + LinkedHashSet newCaseListSampleIds = new LinkedHashSet<>(sampleList.getSampleList()); + if (newCaseListSampleIds.addAll(uploadedSampleIds)) { + sampleList.setSampleList(new ArrayList<>(newCaseListSampleIds)); + daoSampleList.updateSampleListList(sampleList); + } } CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(this.cancerStudyStableId); List sampleLists = daoSampleList.getAllSampleLists(cancerStudy.getInternalId()); List remainingLists = sampleLists.stream().filter(sl -> !caseListSampleIdToSampleIds.containsKey(sl.getStableId()) && sl.getSampleList().stream().anyMatch(this.allSampleIds::contains) - ).collect(Collectors.toList()); - for (SampleList remainingList: remainingLists) { + ).toList(); + for (SampleList remainingList : remainingLists) { ArrayList newSampleList = new ArrayList<>(remainingList.getSampleList()); - newSampleList.removeAll(this.allSampleIds); - remainingList.setSampleList(newSampleList); - //TODO for optimization purpose we could supply to the update 
method 2 set of samples: samples that have to be added and samples that have to be removed - daoSampleList.updateSampleListList(remainingList); + if (newSampleList.removeAll(this.allSampleIds)) { + remainingList.setSampleList(newSampleList); + daoSampleList.updateSampleListList(remainingList); + } } } catch (DaoException e) { throw new RuntimeException(e); @@ -123,40 +131,29 @@ private void updateCaseLists(Map> caseListSampleIdToSampleId private LinkedHashSet readSampleIdsFromDataFile(File dataFile) { LinkedHashSet allSampleIds = new LinkedHashSet<>(); - FileReader reader = null; - try { - reader = new FileReader(dataFile); - try (BufferedReader buff = new BufferedReader(reader)) { - String line; - int sampleIdPosition = -1; - while ((line = buff.readLine()) != null) { - String trimmedLine = line.trim(); - if (trimmedLine.isEmpty() || trimmedLine.startsWith("#")) { - continue; - } + try (FileReader reader = new FileReader(dataFile); + BufferedReader buff = new BufferedReader(reader)) { + String line; + int sampleIdPosition = -1; + while ((line = buff.readLine()) != null) { + String trimmedLine = line.trim(); + if (trimmedLine.isEmpty() || trimmedLine.startsWith("#")) { + continue; + } - String[] fieldValues = line.split("\t"); + String[] fieldValues = line.split("\t"); + if (sampleIdPosition == -1) { + sampleIdPosition = List.of(fieldValues).indexOf("SAMPLE_ID"); if (sampleIdPosition == -1) { - sampleIdPosition = List.of(fieldValues).indexOf("SAMPLE_ID"); - if (sampleIdPosition == -1) { - throw new RuntimeException("No SAMPLE_ID header is found"); - } - } else { - allSampleIds.add(fieldValues[sampleIdPosition].trim()); + throw new RuntimeException("No SAMPLE_ID header is found"); } + } else { + allSampleIds.add(fieldValues[sampleIdPosition].trim()); } - return allSampleIds; } - } catch (Exception e) { + return allSampleIds; + } catch (IOException e) { throw new RuntimeException(e); - } finally { - if (reader != null) { - try { - reader.close(); - } catch 
(IOException e) { - throw new RuntimeException(e); - } - } } } @@ -178,15 +175,15 @@ private void parseArguments() { String description = "Updates (adds/removes) sample ids in specified case lists."; OptionParser parser = new OptionParser(); - OptionSpec metaOpt = parser.accepts( "meta", - "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES or datatype=MIXED_ATTRIBUTES) meta data file. All sample ids found in the file will be added to the _all case list." ).withRequiredArg().required().describedAs( "meta_clinical_sample.txt" ).ofType( String.class ); - OptionSpec caseListDirOrFileOpt = parser.accepts( "case-lists", - "case list file or a directory with case list files" ).withRequiredArg().describedAs( "case_lists/" ).ofType( String.class ); + OptionSpec metaOpt = parser.accepts("meta", + "clinical sample (genetic_alteration_type=CLINICAL and datatype=SAMPLE_ATTRIBUTES or datatype=MIXED_ATTRIBUTES) meta data file. All sample ids found in the file will be added to the _all case list.").withRequiredArg().required().describedAs("meta_clinical_sample.txt").ofType(String.class); + OptionSpec caseListDirOrFileOpt = parser.accepts("case-lists", + "case list file or a directory with case list files").withRequiredArg().describedAs("case_lists/").ofType(String.class); try { - OptionSet options = parser.parse( args ); + OptionSet options = parser.parse(args); this.metaFile = new File(options.valueOf(metaOpt)); - if(options.has(caseListDirOrFileOpt)){ + if (options.has(caseListDirOrFileOpt)) { File caseListDirOrFile = new File(options.valueOf(caseListDirOrFileOpt)); if (caseListDirOrFile.isDirectory()) { this.caseListFiles = Arrays.stream(Objects.requireNonNull(caseListDirOrFile.listFiles())) @@ -207,7 +204,7 @@ private void parseArguments() { /** * Runs the command as a script and exits with an appropriate exit code. 
* - * @param args the arguments given on the command line + * @param args the arguments given on the command line */ public static void main(String[] args) { ConsoleRunnable runner = new UpdateCaseListsSampleIds(args); From 7b527b6f1120edd2a5d4ec89b47c2077f5c766d9 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 1 May 2024 10:10:05 +0200 Subject: [PATCH 037/130] Make tests fail the build. Conduct exit status of tests correctly --- .github/workflows/validate-python.yml | 4 ++-- README.md | 2 +- test_scripts.sh | 4 +++- 3 files changed, 6 insertions(+), 4 deletions(-) mode change 100644 => 100755 test_scripts.sh diff --git a/.github/workflows/validate-python.yml b/.github/workflows/validate-python.yml index ccf67a61..507c4e6e 100644 --- a/.github/workflows/validate-python.yml +++ b/.github/workflows/validate-python.yml @@ -14,7 +14,7 @@ jobs: - name: 'Validate tests' working-directory: ./cbioportal-core run: | - docker run -v ${PWD}:/cbioportal-core python:3.6 /bin/bash -c ' + docker run -v ${PWD}:/cbioportal-core python:3.6 /bin/sh -c ' cd cbioportal-core && pip install -r requirements.txt && - source test_scripts.sh' + ./test_scripts.sh' diff --git a/README.md b/README.md index 60433f9c..252b7ba9 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ After you are done with the setup, you can build and test the project. 1. Execute tests through the provided script: ```bash -source test_scripts.sh +./test_scripts.sh ``` 2. 
Build the loader jar using Maven (includes testing): diff --git a/test_scripts.sh b/test_scripts.sh old mode 100644 new mode 100755 index 807f536e..279192a3 --- a/test_scripts.sh +++ b/test_scripts.sh @@ -1 +1,3 @@ -pushd tests/; PYTHONPATH=../scripts:$PYTHONPATH python -m unittest *.py; popd +#!/bin/bash + +pushd tests/ && PYTHONPATH=../scripts:$PYTHONPATH python -m unittest *.py; exit_stat=$?; popd; exit $exit_stat From f5e8217fcab464f785ce4474d5ef56462a9620c3 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 1 May 2024 13:42:07 +0200 Subject: [PATCH 038/130] Write Validation complete only in case of successful validation --- scripts/importer/validateData.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 774597ad..8c77e018 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -5547,6 +5547,7 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ # additional validation between meta files, after all meta files are processed validate_data_relations(validators_by_meta_type, logger) + logger.info('Validation complete') def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_maf_checks): @@ -5559,6 +5560,7 @@ def validate_data_dir(data_dir, portal_instance, logger, relaxed_mode, strict_ma logger.info("Validating %s", meta_file_type) for validator in validators: validator.validate() + logger.info('Validation complete') def get_pom_path(): @@ -5675,7 +5677,6 @@ def main_validate(args): collapsing_html_handler.flush() html_handler.generateHtml(cbio_version=cbio_version) - logger.info('Validation complete') return exit_status_handler.get_exit_status() From 8d3aaed61cca23f08de3849d67f0a3bd470af352 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 1 May 2024 14:35:20 +0200 Subject: [PATCH 039/130] Add python tests for incremental/full data import --- 
scripts/importer/cbioportalImporter.py | 14 ++-- tests/system_tests_import_data.py | 93 ++++++++++++++++++++++++++ 2 files changed, 100 insertions(+), 7 deletions(-) create mode 100755 tests/system_tests_import_data.py diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index f059282e..3fa2040b 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -207,7 +207,7 @@ def check_version(jvm_args): raise def process_case_lists(jvm_args, case_list_dir): - for case_list in os.listdir(case_list_dir): + for case_list in sorted(os.listdir(case_list_dir)): # skip "temp"/backup files made by some text editors: if not (case_list.startswith('.') or case_list.endswith('~')): import_case_list(jvm_args, os.path.join(case_list_dir, case_list)) @@ -232,13 +232,13 @@ def process_command(jvm_args, command, meta_filename, data_filename, study_ids, import_case_list(jvm_args, meta_filename) def get_meta_filenames(data_directory): - meta_filenames = ( + meta_filenames = [ os.path.join(data_directory, meta_filename) for meta_filename in os.listdir(data_directory) if re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, flags=re.IGNORECASE) and - not (meta_filename.startswith('.') or meta_filename.endswith('~'))) - return meta_filenames + not (meta_filename.startswith('.') or meta_filename.endswith('~'))] + return sorted(meta_filenames) def process_study_directory(jvm_args, study_directory, update_generic_assay_entity = None): """ @@ -525,7 +525,7 @@ def add_parser_args(parser): parser.add_argument('-data', '--data_filename', type=str, required=False, help='Path to Data file') -def interface(): +def interface(args=None): parent_parser = argparse.ArgumentParser(description='cBioPortal meta Importer') add_parser_args(parent_parser) parser = argparse.ArgumentParser() @@ -555,7 +555,7 @@ def interface(): # TODO - add same argument to metaimporter # TODO - harmonize on - and _ - parser = parser.parse_args() + 
parser = parser.parse_args(args) if parser.command is not None and parser.subcommand is not None: print('Cannot call multiple commands') sys.exit(2) @@ -637,5 +637,5 @@ def main(args): # ready to roll if __name__ == '__main__': - parsed_args = interface() + parsed_args = interface(args) main(parsed_args) diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py new file mode 100755 index 00000000..a646e0c4 --- /dev/null +++ b/tests/system_tests_import_data.py @@ -0,0 +1,93 @@ +#!/usr/bin/env python3 + +''' +This code is licensed under the GNU Affero General Public License (AGPL), +version 3, or (at your option) any later version. +''' + +import unittest +from unittest import mock +from unittest.mock import call +from importer import cbioportalImporter + +common_part = ('-Dspring.profiles.active=dbcp', '-cp', 'test.jar') + +class DataImporterTests(unittest.TestCase): + ''' + Tests of commands produced by scripts + ''' + + def setUp(self): + self.maxDiff = None + + @mock.patch('importer.cbioportalImporter.locate_jar') + @mock.patch('importer.cbioportalImporter.run_java') + def test_full_study_load(self, run_java, locate_jar): + ''' + Tests java commands full study load produces + ''' + locate_jar.return_value = "test.jar" + + study_directory = 'test_data/study_es_0' + args = ['--study_directory', study_directory] + parsed_args = cbioportalImporter.interface(args) + cbioportalImporter.main(parsed_args) + + self.assertListEqual(run_java.call_args_list, [ + call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTypesOfCancers', f'{study_directory}/data_cancer_type.txt', 'false', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.RemoveCancerStudy', 'study_es_0', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCancerStudy', f'{study_directory}/meta_study.txt', '--noprogress'), + call(*common_part, 
'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--meta', f'{study_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_samples.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceDefinition', '--meta', f'{study_directory}/meta_resource_definition.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_definition.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_sample.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_sample.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--meta', f'{study_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_patients.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData', '--meta', f'{study_directory}/meta_cna_hg19_seg.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_cna_hg19.seg', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_log2.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_generic_assay_patient_test.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_generic_assay_patient_test.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGisticData', 
'--data', f'{study_directory}/data_gistic_genes_amp.txt', '--study', 'study_es_0', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_methylation_hm27.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutational_signature.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutational_signature.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutations_extended.maf', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_patient.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_patient.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_study.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_study.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ec50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ec50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ic50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_structural_variants.txt', '--loadMode', 
'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_structural_variants.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_discrete.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median_Zscores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median_Zscores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_scores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_scores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_pvalues.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_pvalues.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--meta', f'{study_directory}/meta_gene_panel_matrix.txt', '--data', f'{study_directory}/data_gene_panel_matrix.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cna.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cnaseq.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_custom.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_sequenced.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', 
f'{study_directory}/case_lists/cases_test.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.AddCaseList', 'study_es_0', 'all', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCancerStudy', 'study_es_0', 'AVAILABLE', '--noprogress')]) + + @mock.patch('importer.cbioportalImporter.locate_jar') + @mock.patch('importer.cbioportalImporter.run_java') + def test_incremental_load(self, run_java, locate_jar): + ''' + Tests java commands incremental load produces + ''' + locate_jar.return_value = "test.jar" + + data_directory = 'test_data/study_es_0_inc' + args = ['--data_directory', data_directory] + parsed_args = cbioportalImporter.interface(args) + cbioportalImporter.main(parsed_args) + + self.assertListEqual(run_java.call_args_list, [ + call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', '--meta', f'{data_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_patients.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists')]) + + +if __name__ == '__main__': + unittest.main(buffer=True) From 1b6ba416014a5f91060a336c764efa4f2eccceae Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: 
Wed, 1 May 2024 17:30:57 +0200 Subject: [PATCH 040/130] Add unit test for incremental data validation --- tests/unit_tests_validate_data.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index 3048a821..f583f7e6 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -3087,5 +3087,13 @@ def test_required_field_permutations(self): self.assertEqual(logging.ERROR, record.levelno) self.assertIn('This line has no value for cbp_driver_tiers and a value for cbp_driver_tiers_annotation. Please, fill the cbp_driver_tiers column.', record.getMessage()) + def test_incremental_data_validation(self): + validateData.validate_data_dir('test_data/study_es_0_inc', + PORTAL_INSTANCE, + self.logger, False, False) + record_list = self.get_log_records() + self.assertEqual('Validation complete', record_list[-1].getMessage()) + + if __name__ == '__main__': unittest.main(buffer=True) From d25200189ae8464b97457687455bd8eafda6341b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 3 May 2024 17:00:48 +0200 Subject: [PATCH 041/130] Test rough order of importer commands. 
Remove sorting in the script to guarantee that --- scripts/importer/cbioportalImporter.py | 4 +- tests/system_tests_import_data.py | 83 ++++++++++++++++++-------- 2 files changed, 60 insertions(+), 27 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index 3fa2040b..ff971f0d 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -207,7 +207,7 @@ def check_version(jvm_args): raise def process_case_lists(jvm_args, case_list_dir): - for case_list in sorted(os.listdir(case_list_dir)): + for case_list in os.listdir(case_list_dir): # skip "temp"/backup files made by some text editors: if not (case_list.startswith('.') or case_list.endswith('~')): import_case_list(jvm_args, os.path.join(case_list_dir, case_list)) @@ -238,7 +238,7 @@ def get_meta_filenames(data_directory): re.search(r'(\b|_)meta(\b|[_0-9])', meta_filename, flags=re.IGNORECASE) and not (meta_filename.startswith('.') or meta_filename.endswith('~'))] - return sorted(meta_filenames) + return meta_filenames def process_study_directory(jvm_args, study_directory, update_generic_assay_entity = None): """ diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index a646e0c4..097e6c01 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -33,32 +33,44 @@ def test_full_study_load(self, run_java, locate_jar): parsed_args = cbioportalImporter.interface(args) cbioportalImporter.main(parsed_args) - self.assertListEqual(run_java.call_args_list, [ + remove_study_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.RemoveCancerStudy', + 'study_es_0', '--noprogress') + create_study_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCancerStudy', + f'{study_directory}/meta_study.txt', '--noprogress') + clinical_sample_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', + '--meta', 
f'{study_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_samples.txt', '--noprogress') + make_study_available_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCancerStudy', + 'study_es_0', 'AVAILABLE', '--noprogress') + mol_profile_calls = [ + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_log2.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_generic_assay_patient_test.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_generic_assay_patient_test.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_methylation_hm27.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutational_signature.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutational_signature.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutations_extended.maf', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', 
f'{study_directory}/meta_resource_patient.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_patient.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_study.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_study.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ec50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ec50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ic50.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_structural_variants.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_structural_variants.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_discrete.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median_Zscores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median_Zscores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_scores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_scores.txt', '--noprogress'), + call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', 
f'{study_directory}/meta_gsva_pvalues.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_pvalues.txt', '--noprogress'), + + ] + self.assertCountEqual(run_java.call_args_list, [ call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTypesOfCancers', f'{study_directory}/data_cancer_type.txt', 'false', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.RemoveCancerStudy', 'study_es_0', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCancerStudy', f'{study_directory}/meta_study.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--meta', f'{study_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_samples.txt', '--noprogress'), + remove_study_call, + create_study_call, + clinical_sample_call, call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceDefinition', '--meta', f'{study_directory}/meta_resource_definition.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_definition.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_sample.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_sample.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--meta', f'{study_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_clinical_patients.txt', '--noprogress'), + *mol_profile_calls, call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData', '--meta', f'{study_directory}/meta_cna_hg19_seg.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_cna_hg19.seg', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', 
f'{study_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_log2.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_generic_assay_patient_test.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_generic_assay_patient_test.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGisticData', '--data', f'{study_directory}/data_gistic_genes_amp.txt', '--study', 'study_es_0', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_methylation_hm27.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutational_signature.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutational_signature.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_mutations_extended.maf', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', f'{study_directory}/meta_resource_patient.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_patient.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportResourceData', '--meta', 
f'{study_directory}/meta_resource_study.txt', '--loadMode', 'bulkload', '--data', f'{study_directory}/data_resource_study.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ec50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ec50.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_treatment_ic50.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_structural_variants.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_structural_variants.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_cna_discrete.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_expression_median_Zscores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_expression_median_Zscores.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_scores.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_scores.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--meta', f'{study_directory}/meta_gsva_pvalues.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{study_directory}/data_gsva_pvalues.txt', '--noprogress'), call(*common_part, 
'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--meta', f'{study_directory}/meta_gene_panel_matrix.txt', '--data', f'{study_directory}/data_gene_panel_matrix.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cna.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_cnaseq.txt', '--noprogress'), @@ -66,7 +78,15 @@ def test_full_study_load(self, run_java, locate_jar): call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_sequenced.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportSampleList', f'{study_directory}/case_lists/cases_test.txt', '--noprogress'), call(*common_part, 'org.mskcc.cbio.portal.scripts.AddCaseList', 'study_es_0', 'all', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCancerStudy', 'study_es_0', 'AVAILABLE', '--noprogress')]) + make_study_available_call, + ]) + + self.assertTrue(run_java.call_args_list.index(remove_study_call) < run_java.call_args_list.index(create_study_call)) + self.assertTrue(run_java.call_args_list.index(create_study_call) < run_java.call_args_list.index(clinical_sample_call)) + self.assertTrue(all(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(mol_profile_call) + for mol_profile_call in mol_profile_calls)) + self.assertEqual(run_java.call_args_list[-1], make_study_available_call) + @mock.patch('importer.cbioportalImporter.locate_jar') @mock.patch('importer.cbioportalImporter.run_java') @@ -81,12 +101,25 @@ def test_incremental_load(self, run_java, locate_jar): parsed_args = cbioportalImporter.interface(args) cbioportalImporter.main(parsed_args) - self.assertListEqual(run_java.call_args_list, [ + clinical_patient_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', + 
'--meta', f'{data_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_patients.txt', '--noprogress') + clinical_sample_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress') + mutation_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') + case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', + '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') + + self.assertCountEqual(run_java.call_args_list, [ call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', '--meta', f'{data_directory}/meta_clinical_patients.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_patients.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportClinicalData', '--overwrite-existing', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress'), - call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', 
f'{data_directory}/case_lists')]) + clinical_patient_call, + clinical_sample_call, + mutation_call, + case_list_call, + ]) + + self.assertTrue(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(mutation_call)) + self.assertTrue(run_java.call_args_list.index(clinical_sample_call) < run_java.call_args_list.index(case_list_call)) if __name__ == '__main__': From c27b8f16051431d384c8cf35aa3a223200b55867 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 3 May 2024 17:30:41 +0200 Subject: [PATCH 042/130] Extract smaller functions from the big one in py script Make process_data_directory(...) smaller --- scripts/importer/cbioportalImporter.py | 38 ++++++++++++++++++-------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index ff971f0d..f52bbc6a 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -12,6 +12,7 @@ import logging import re from pathlib import Path +from typing import Dict, Tuple # configure relative imports if running as a script; see PEP 366 # it might passed as empty string by certain tooling to mark a top level module @@ -431,16 +432,10 @@ def process_study_directory(jvm_args, study_directory, update_generic_assay_enti # enable study update_study_status(jvm_args, study_id) - -def process_data_directory(jvm_args, data_directory, update_generic_assay_entity = None): +def get_meta_filenames_by_type(data_directory) -> Dict[str, Tuple[str, Dict]]: """ - Incremental import of data directory based on meta files found. - - 1. Determine meta files in directory. - 2. Read all meta files and determine file types. - 3. Import data files in specific order by file type with the incremental flag. + Read all meta files in the data directory and return meta information (filename, content) grouped by type. 
""" - meta_file_type_to_meta_files = {} # Determine meta filenames in study directory @@ -462,11 +457,12 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity meta_file_type_to_meta_files[meta_file_type] = [] meta_file_type_to_meta_files[meta_file_type].append((meta_filename, meta_dictionary)) + return meta_file_type_to_meta_files - - not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES - if not_supported_meta_types: - raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) +def import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files): + """ + Load all data types that are available and support incremental upload + """ for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: meta_pairs = meta_file_type_to_meta_files[meta_file_type] for meta_pair in meta_pairs: @@ -474,6 +470,12 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity data_filename = os.path.join(data_directory, meta_dictionary['data_filename']) import_data(jvm_args, meta_filename, data_filename, update_generic_assay_entity, meta_dictionary, incremental=True) +def update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files): + """ + Updates case lists if clinical sample provided. + The command takes case_list/ folder as optional argument. + If folder exists case lists will be updated accordingly. 
+ """ if MetaFileTypes.SAMPLE_ATTRIBUTES in meta_file_type_to_meta_files: case_list_dirname = os.path.join(data_directory, 'case_lists') sample_attributes_metas = meta_file_type_to_meta_files[MetaFileTypes.SAMPLE_ATTRIBUTES] @@ -482,6 +484,18 @@ def process_data_directory(jvm_args, data_directory, update_generic_assay_entity LOGGER.info('Updating case lists with sample ids', extra={'filename_': meta_filename}) update_case_lists(jvm_args, meta_filename, case_lists_file_or_dir=case_list_dirname if os.path.isdir(case_list_dirname) else None) +def process_data_directory(jvm_args, data_directory, update_generic_assay_entity = None): + """ + Incremental import of data directory based on meta files found. + """ + + meta_file_type_to_meta_files = get_meta_filenames_by_type(data_directory) + + not_supported_meta_types = meta_file_type_to_meta_files.keys() - INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES + if not_supported_meta_types: + raise NotImplementedError("These types do not support incremental upload: {}".format(", ".join(not_supported_meta_types))) + import_incremental_data(jvm_args, data_directory, update_generic_assay_entity, meta_file_type_to_meta_files) + update_case_lists_from_folder(jvm_args, data_directory, meta_file_type_to_meta_files) def usage(): # TODO : replace this by usage string from interface() From b2c1c21665496f77fe91addb6521e8eb674438e2 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 2 May 2024 12:02:13 +0200 Subject: [PATCH 043/130] Refactor tab delim. 
data importer - Calculate number of lines in the file in the loader - Remove unused imports and fields - Reuse constructors - Reuse common parsing logic in tab delimiter importer - Show full stacktrace which helps in finding where tests errored out --- pom.xml | 3 + .../cbio/portal/dao/DaoGeneticAlteration.java | 4 +- .../portal/scripts/ImportProfileData.java | 19 +- .../portal/scripts/ImportResourceData.java | 1 - .../portal/scripts/ImportTabDelimData.java | 594 ++++++++---------- .../org/mskcc/cbio/portal/util/FileUtil.java | 60 +- .../scripts/TestImportTabDelimData.java | 29 +- 7 files changed, 299 insertions(+), 411 deletions(-) diff --git a/pom.xml b/pom.xml index c71f78a2..e858e319 100644 --- a/pom.xml +++ b/pom.xml @@ -252,6 +252,9 @@ org.apache.maven.plugins maven-surefire-plugin 2.21.0 + + false + default-test diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 25bef125..25eca11c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -65,12 +65,10 @@ private DaoGeneticAlteration() { * Gets Instance of Dao Object. (Singleton pattern). * * @return DaoGeneticAlteration Object. - * @throws DaoException Dao Initialization Error. 
*/ - public static DaoGeneticAlteration getInstance() throws DaoException { + public static DaoGeneticAlteration getInstance() { if (daoGeneticAlteration == null) { daoGeneticAlteration = new DaoGeneticAlteration(); - } return daoGeneticAlteration; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index a0ffe297..e4b11844 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -54,12 +54,8 @@ public class ImportProfileData extends ConsoleRunnable { public void run() { DaoGeneOptimized daoGene; DaoGeneticAlteration daoGeneticAlteration; - try { - daoGene = DaoGeneOptimized.getInstance(); - daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - } catch (DaoException e) { - throw new RuntimeException("Could not create dao instances", e); - } + daoGene = DaoGeneOptimized.getInstance(); + daoGeneticAlteration = DaoGeneticAlteration.getInstance(); try { // Parse arguments @@ -92,8 +88,7 @@ public void run() { " --> profile id: " + geneticProfile.getGeneticProfileId() + "\n --> profile name: " + geneticProfile.getProfileName() + "\n --> genetic alteration type: " + geneticProfile.getGeneticAlterationType().name()); - ProgressMonitor.setMaxValue(numLines); - + // Check genetic alteration type if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_EXTENDED || geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_UNCALLED) { @@ -132,9 +127,9 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), - daoGeneticAlteration + daoGeneticAlteration, daoGene ); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } } else if( geneticProfile.getGeneticAlterationType() == 
GeneticAlterationType.COPY_NUMBER_ALTERATION @@ -156,13 +151,13 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, - daoGeneticAlteration + daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); if (pdAnnotationsFilename != null && !"".equals(pdAnnotationsFilename)) { importer.setPdAnnotationsFile(new File(dataFile.getParent(), pdAnnotationsFilename)); } - importer.importData(numLines); + importer.importData(); } } catch (Exception e) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java index d04124ba..147d59d9 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java @@ -7,7 +7,6 @@ import java.io.*; import joptsimple.*; import java.util.*; -import java.util.regex.*; import java.util.stream.Collectors; import org.apache.commons.collections4.map.MultiKeyMap; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index b984abf4..81300e63 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -53,11 +53,8 @@ */ public class ImportTabDelimData { public static final String CNA_VALUE_AMPLIFICATION = "2"; - public static final String CNA_VALUE_GAIN = "1"; - public static final String CNA_VALUE_HEMIZYGOUS_DELETION = "-1"; public static final String CNA_VALUE_HOMOZYGOUS_DELETION = "-2"; public static final String CNA_VALUE_PARTIAL_DELETION = "-1.5"; - public static final String CNA_VALUE_ZERO = "0"; private HashSet importedGeneticEntitySet = new HashSet<>(); private File dataFile; private String targetLine; @@ -72,6 +69,11 @@ public class ImportTabDelimData { 
private Map, Map> pdAnnotations; private final GeneticAlterationImporter geneticAlterationImporter; + private int numLines; + private DaoGeneticAlteration daoGeneticAlteration; + + private DaoGeneOptimized daoGene; + /** * Constructor. * @@ -90,17 +92,11 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; - this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.genericEntityProperties = genericEntityProperties; - this.geneticAlterationImporter = new GeneticAlterationImporter( - geneticProfileId, - daoGeneticAlteration - ); + this(dataFile, targetLine, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this.genericEntityProperties = genericEntityProperties; } /** @@ -118,13 +114,11 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; + this(dataFile, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); } /** @@ -137,12 +131,15 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanel = genePanel; + this.daoGeneticAlteration = daoGeneticAlteration; this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); + this.daoGene = daoGene; } /** @@ 
-151,14 +148,20 @@ public ImportTabDelimData( * @throws IOException IO Error. * @throws DaoException Database Error. */ - public void importData(int numLines) throws IOException, DaoException { + public void importData() throws IOException, DaoException { + try { + this.numLines = FileUtil.getNumLines(dataFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + ProgressMonitor.setMaxValue(numLines); geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); - String parts[] = headerLine.split("\t"); + String headerParts[] = headerLine.split("\t"); //Whether data regards CNA or RPPA: boolean isDiscretizedCnaProfile = geneticProfile != null @@ -166,23 +169,23 @@ public void importData(int numLines) throws IOException, DaoException { && geneticProfile.showProfileInAnalysisTab(); boolean isRppaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.PROTEIN_LEVEL - && "Composite.Element.Ref".equalsIgnoreCase(parts[0]); + && "Composite.Element.Ref".equalsIgnoreCase(headerParts[0]); boolean isGsvaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE - && parts[0].equalsIgnoreCase("geneset_id"); + && headerParts[0].equalsIgnoreCase("geneset_id"); boolean isGenericAssayProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY - && parts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); + && headerParts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); int numRecordsToAdd = 0; int samplesSkipped = 0; try { - int hugoSymbolIndex = getHugoSymbolIndex(parts); - int entrezGeneIdIndex = getEntrezGeneIdIndex(parts); - int rppaGeneRefIndex = getRppaGeneRefIndex(parts); - int genesetIdIndex = getGenesetIdIndex(parts); - int sampleStartIndex = getStartIndex(parts, 
hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); - int genericAssayIdIndex = getGenericAssayIdIndex(parts); + int hugoSymbolIndex = getHugoSymbolIndex(headerParts); + int entrezGeneIdIndex = getEntrezGeneIdIndex(headerParts); + int rppaGeneRefIndex = getRppaGeneRefIndex(headerParts); + int genesetIdIndex = getGenesetIdIndex(headerParts); + int sampleStartIndex = getStartIndex(headerParts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); + int genericAssayIdIndex = getGenericAssayIdIndex(headerParts); if (isRppaProfile) { if (rppaGeneRefIndex == -1) { throw new RuntimeException("Error: the following column should be present for RPPA data: Composite.Element.Ref"); @@ -200,8 +203,8 @@ public void importData(int numLines) throws IOException, DaoException { } String sampleIds[]; - sampleIds = new String[parts.length - sampleStartIndex]; - System.arraycopy(parts, sampleStartIndex, sampleIds, 0, parts.length - sampleStartIndex); + sampleIds = new String[headerParts.length - sampleStartIndex]; + System.arraycopy(headerParts, sampleStartIndex, sampleIds, 0, headerParts.length - sampleStartIndex); int nrUnknownSamplesAdded = 0; ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); @@ -258,12 +261,6 @@ public void importData(int numLines) throws IOException, DaoException { DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - //Gene cache: - DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); - - //Object to insert records in the generic 'genetic_alteration' table: - DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); if (isDiscretizedCnaProfile) { @@ -277,28 +274,70 @@ public void importData(int numLines) throws IOException, DaoException { genericAssayStableIdToEntityIdMap = GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap(); } - 
int lenParts = parts.length; + int headerColumns = headerParts.length; String line = buf.readLine(); while (line != null) { + ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); boolean recordAdded = false; - // either parse line as geneset or gene for importing into 'genetic_alteration' table - if (isGsvaProfile) { - recordAdded = parseGenesetLine(line, lenParts, sampleStartIndex, genesetIdIndex, - filteredSampleIndices, daoGeneticAlteration); - } else if (isGenericAssayProfile) { - recordAdded = parseGenericAssayLine(line, lenParts, sampleStartIndex, genericAssayIdIndex, - filteredSampleIndices, daoGeneticAlteration, genericAssayStableIdToEntityIdMap); - } else { - recordAdded = parseLine(line, lenParts, sampleStartIndex, - hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, - isRppaProfile, isDiscretizedCnaProfile, - daoGene, - filteredSampleIndices, orderedSampleList, - existingCnaEvents); + if (FileUtil.isInfoLine(line)) { + String[] rowParts = line.split("\t", -1); + + if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { + ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + + } else { + String sampleValues[] = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length > headerColumns ? 
headerColumns : rowParts.length); + + // trim whitespace from values + sampleValues = Stream.of(sampleValues).map(String::trim).toArray(String[]::new); + sampleValues = filterOutNormalValues(filteredSampleIndices, sampleValues); + + // either parse line as geneset or gene for importing into 'genetic_alteration' table + if (isGsvaProfile) { + String genesetId = rowParts[genesetIdIndex]; + recordAdded = saveGenesetLine(sampleValues, genesetId); + } else if (isGenericAssayProfile) { + String genericAssayId = rowParts[genericAssayIdIndex]; + recordAdded = saveGenericAssayLine(sampleValues, genericAssayId, genericAssayStableIdToEntityIdMap); + } else { + String geneSymbol = null; + if (hugoSymbolIndex != -1) { + geneSymbol = rowParts[hugoSymbolIndex]; + } + if (rppaGeneRefIndex != -1) { + geneSymbol = rowParts[rppaGeneRefIndex]; + } + if (geneSymbol != null && geneSymbol.isEmpty()) { + geneSymbol = null; + } + //get entrez + String entrez = null; + if (entrezGeneIdIndex != -1) { + entrez = rowParts[entrezGeneIdIndex]; + } + if (entrez != null && entrez.isEmpty()) { + entrez = null; + } + if (entrez != null && !entrez.matches("[0-9]+")) { + //TODO - would be better to give an exception in some cases, like negative Entrez values + ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); + } else { + String firstCellValue = rowParts[0]; + if (targetLine == null || firstCellValue.equals(targetLine)) { + recordAdded = saveLine(sampleValues, + entrez, geneSymbol, + isRppaProfile, isDiscretizedCnaProfile, orderedSampleList, + existingCnaEvents); + } + } + } + } + } // increment number of records added or entries skipped @@ -483,331 +522,226 @@ private Map, Map> readPdAnnotations(File * AMIXED0... 
* * - * @param line the line from the profile data file to be parsed - * @param nrColumns the number of columns, defined by the header line - * @param sampleStartIndex the index of the first column with a sample name in the header field - * @param hugoSymbolIndex the index of the column Hugo_Symbol - * @param entrezGeneIdIndex the index of the column Entrez_Gene_Id - * @param rppaGeneRefIndex the index of the column Composite.Element.Ref * @param isRppaProfile true if this is an rppa profile (i.e. alteration type is PROTEIN_LEVEL and the first column is Composite.Element.Ref) * @param isDiscretizedCnaProfile true if this is a discretized CNA profile (i.e. alteration type COPY_NUMBER_ALTERATION and showProfileInAnalysisTab is true) - * @param daoGene an instance of DaoGeneOptimized ... for use in resolving gene symbols * @param orderedSampleList a list of the internal sample ids corresponding to the sample names in the header line * @param existingCnaEvents a collection of CnaEvents, to be added to or updated during parsing of individual lines * @return true if any record was stored in genetic_alteration, else false * @throws DaoException if any DaoException is thrown while using daoGene or daoGeneticAlteration */ - private boolean parseLine(String line, int nrColumns, int sampleStartIndex, - int hugoSymbolIndex, int entrezGeneIdIndex, int rppaGeneRefIndex, - boolean isRppaProfile, boolean isDiscretizedCnaProfile, - DaoGeneOptimized daoGene, - List filteredSampleIndices, List orderedSampleList, - Set existingCnaEvents + private boolean saveLine(String[] values, + String entrez, + String geneSymbol, + boolean isRppaProfile, + boolean isDiscretizedCnaProfile, + List orderedSampleList, + Set existingCnaEvents ) throws DaoException { - //TODO: refactor this entire function - split functionality into smaller units / subroutines - boolean recordStored = false; - // Ignore lines starting with # - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = 
line.split("\t", -1); + if (isRppaProfile && geneSymbol == null) { + ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + return false; + } - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - values = filterOutNormalValues(filteredSampleIndices, values); + //If all are empty, skip line: + boolean noGeneSpecified = geneSymbol == null && entrez == null; + if (noGeneSpecified) { + ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + return false; + } - String geneSymbol = null; - if (hugoSymbolIndex != -1) { - geneSymbol = parts[hugoSymbolIndex]; - } - //RPPA: //TODO - we should split up the RPPA scenario from this code...too many if/else because of this - if (rppaGeneRefIndex != -1) { - geneSymbol = parts[rppaGeneRefIndex]; - } - if (geneSymbol != null && geneSymbol.isEmpty()) { - geneSymbol = null; + if (geneSymbol != null) { + boolean multipleGenesLine = geneSymbol.contains("///"); + if (multipleGenesLine) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is separated by ///. This indicates that the line contains information regarding multiple genes, and we cannot currently handle this"); + return false; } - if (isRppaProfile && geneSymbol == null) { - ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + boolean unknownGene = geneSymbol.contains("---"); + if (unknownGene) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is specified as ---. 
This indicates that the line contains information regarding an unknown gene, and we cannot currently handle this"); return false; } - //get entrez - String entrez = null; - if (entrezGeneIdIndex != -1) { - entrez = parts[entrezGeneIdIndex]; + } + + List genes; + //If rppa, parse genes from "Composite.Element.REF" column: + if (isRppaProfile) { + genes = parseRPPAGenes(geneSymbol); + } else { + genes = parseGenes(entrez, geneSymbol); + } + + //if genes still null, skip current record + if (genes == null || genes.isEmpty()) { + ProgressMonitor.logWarning("Gene with Entrez_Id " + entrez + " and gene symbol" + geneSymbol +" not found. Record will be skipped for this gene."); + return false; + } + + List genesMatchingAnAlias = Collections.emptyList(); + if (geneSymbol != null) { + genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); + } + + Set microRNAGenes = new HashSet<>(); + Set nonMicroRNAGenes = new HashSet<>(); + Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); + while (geneIterator.hasNext()) { + CanonicalGene g = geneIterator.next(); + if ("miRNA".equals(g.getType())) { + microRNAGenes.add(g); + } else { + nonMicroRNAGenes.add(g); } - if (entrez != null) { - if (entrez.isEmpty()) { - entrez = null; - } - else if (!entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values - ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); - return false; + } + if (!microRNAGenes.isEmpty()) { + // for micro rna, duplicate the data + for (CanonicalGene gene : microRNAGenes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; } } - - //If all are empty, skip line: - if (geneSymbol == null && entrez == null) { - ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + if (!recordStored) { + if (nonMicroRNAGenes.isEmpty()) { + // this means that no microRNA records 
could not be stored + ProgressMonitor.logWarning("Could not store microRNA data"); + } else { + // this case : + // - at least one of the entrez-gene-ids was not a microRNA + // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); + } return false; + } + } else { + // none of the matched genes are type "miRNA" + if (genes.size() == 1) { + // Store all values per gene: + recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); + //only add extra CNA related records if the step above worked, otherwise skip: + if (recordStored && isDiscretizedCnaProfile) { + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, orderedSampleList, genes)); + } } else { - if (geneSymbol != null && (geneSymbol.contains("///") || geneSymbol.contains("---"))) { - // Ignore gene IDs separated by ///. This indicates that - // the line contains information regarding multiple genes, and - // we cannot currently handle this. - // Also, ignore gene IDs that are specified as ---. This indicates - // the line contains information regarding an unknown gene, and - // we cannot currently handle this. - ProgressMonitor.logWarning("Ignoring gene ID: " + geneSymbol); - return false; + if (isRppaProfile) { // for protein data, duplicate the data + recordStored = saveRppaValues(values, geneSymbol, recordStored, genes); } else { - List genes = null; - //If rppa, parse genes from "Composite.Element.REF" column: - if (isRppaProfile) { - genes = parseRPPAGenes(geneSymbol); - if (genes == null) { - //will be null when there is a parse error in this case, so we - //can return here and avoid duplicated messages: - return false; - } - if (genes.isEmpty()) { - String gene = (geneSymbol != null) ? 
geneSymbol : entrez; - ProgressMonitor.logWarning("Gene not found for: [" + gene - + "]. Ignoring it " - + "and all tab-delimited data associated with it!"); - return false; - } - } else { - //try entrez: - if (entrez != null) { - CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); - if (gene != null) { - genes = Arrays.asList(gene); - } - } - //no entrez or could not resolve by entrez, try hugo: - if ((genes == null || genes.isEmpty()) && geneSymbol != null) { - // deal with multiple symbols separate by |, use the first one - int ix = geneSymbol.indexOf("|"); - if (ix > 0) { - geneSymbol = geneSymbol.substring(0, ix); - } - genes = daoGene.getGene(geneSymbol, true); - } - //if genes still null, skip current record - if (genes == null || genes.isEmpty()) { - ProgressMonitor.logWarning("Entrez_Id " + entrez + " not found. Record will be skipped for this gene."); - return false; - } + if (!recordStored) { + // this case : + // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); } + } + } + } + return recordStored; + } - // If targetLine is specified and does not match the current line, skip the current line. 
- if (targetLine != null && !(parts[0].equals(targetLine))) { - return false; - } + private boolean saveRppaValues(String[] values, String geneSymbol, boolean recordStored, List genes) throws DaoException { + for (CanonicalGene gene : genes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; + nrExtraRecords++; + } + } + if (recordStored) { + //skip one, to avoid double counting: + nrExtraRecords--; + } else { + // this means that RPPA could not be stored + ProgressMonitor.logWarning("Could not store RPPA data"); + } + return recordStored; + } - List genesMatchingAnAlias = Collections.emptyList(); - if (geneSymbol != null) { - genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); - } + private List parseGenes(String entrez, String geneSymbol) { + //try entrez: + if (entrez != null) { + CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); + if (gene != null) { + return Arrays.asList(gene); + } + } + //no entrez or could not resolve by entrez, try hugo: + if (geneSymbol != null) { + // deal with multiple symbols separate by |, use the first one + int ix = geneSymbol.indexOf("|"); + if (ix > 0) { + geneSymbol = geneSymbol.substring(0, ix); + } + return daoGene.getGene(geneSymbol, true); + } + return List.of(); + } - Set microRNAGenes = new HashSet<>(); - Set nonMicroRNAGenes = new HashSet<>(); - Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); - while (geneIterator.hasNext()) { - CanonicalGene g = geneIterator.next(); - if ("miRNA".equals(g.getType())) { - microRNAGenes.add(g); - } else { - nonMicroRNAGenes.add(g); - } - } - if (!microRNAGenes.isEmpty()) { - // for micro rna, duplicate the data - for (CanonicalGene gene : microRNAGenes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - } - } - if (!recordStored) { - if (nonMicroRNAGenes.isEmpty()) { - // this means that no microRNA records could not be stored - 
ProgressMonitor.logWarning("Could not store microRNA data"); - } else { - // this case : - // - at least one of the entrez-gene-ids was not a microRNA - // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); - } - return false; - } - } else { - // none of the matched genes are type "miRNA" - if (genes.size() == 1) { - List cnaEventsToAdd = new ArrayList(); - - if (isDiscretizedCnaProfile) { - long entrezGeneId = genes.get(0).getEntrezGeneId(); - for (int i = 0; i < values.length; i++) { - - // temporary solution -- change partial deletion back to full deletion. - if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { - values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; - } - if (values[i].equals(CNA_VALUE_AMPLIFICATION) - // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB - // || values[i].equals(CNA_VALUE_ZERO) - // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) - || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) - ) { - Integer sampleId = orderedSampleList.get(i); - CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); - //delayed add: - AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); - Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); - if (pdAnnotationDetails != null) { - cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); - cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); - cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); - cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); - } - cnaEventsToAdd.add(cnaEvent); - } - } - } - // Store all 
values per gene: - recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); - //only add extra CNA related records if the step above worked, otherwise skip: - if (recordStored) { - CnaUtil.storeCnaEvents(existingCnaEvents, cnaEventsToAdd); - } - } else { - if (isRppaProfile) { // for protein data, duplicate the data - for (CanonicalGene gene : genes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - nrExtraRecords++; - } - } - if (recordStored) { - //skip one, to avoid double counting: - nrExtraRecords--; - } else { - // this means that RPPA could not be stored - ProgressMonitor.logWarning("Could not store RPPA data"); - } - } else { - if (!recordStored) { - // this case : - // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); - } - } - } - } + private List composeCnaEventsToAdd(String[] values, List orderedSampleList, List genes) { + List cnaEventsToAdd = new ArrayList(); + long entrezGeneId = genes.get(0).getEntrezGeneId(); + for (int i = 0; i < values.length; i++) { + + // temporary solution -- change partial deletion back to full deletion. 
+ if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { + values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; + } + if (values[i].equals(CNA_VALUE_AMPLIFICATION) + // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB + // || values[i].equals(CNA_VALUE_ZERO) + // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) + || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) + ) { + Integer sampleId = orderedSampleList.get(i); + CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); + //delayed add: + AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); + Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); + if (pdAnnotationDetails != null) { + cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); + cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); + cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); + cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); } + cnaEventsToAdd.add(cnaEvent); } } - return recordStored; + return cnaEventsToAdd; } /** * Parses line for gene set record and stores record in 'genetic_alteration' table. 
- * @param line - * @param nrColumns - * @param sampleStartIndex - * @param genesetIdIndex - * @param filteredSampleIndices - * @param daoGeneticAlteration + * @param genesetId * @return * @throws DaoException */ - private boolean parseGenesetLine(String line, int nrColumns, int sampleStartIndex, int genesetIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration) throws DaoException { + private boolean saveGenesetLine(String[] values, String genesetId) throws DaoException { boolean storedRecord = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - Geneset geneset = DaoGeneset.getGenesetByExternalId(parts[genesetIdIndex]); - if (geneset != null) { - storedRecord = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, geneset.getGeneticEntityId(), - EntityType.GENESET, geneset.getExternalId()); - } - else { - ProgressMonitor.logWarning("Geneset " + parts[genesetIdIndex] + " not found in DB. Record will be skipped."); - } + Geneset geneset = DaoGeneset.getGenesetByExternalId(genesetId); + if (geneset != null) { + storedRecord = storeGeneticEntityGeneticAlterations(values, geneset.getGeneticEntityId(), EntityType.GENESET, geneset.getExternalId()); + } + else { + ProgressMonitor.logWarning("Geneset " + genesetId + " not found in DB. 
Record will be skipped."); } return storedRecord; } /** * Parses line for generic assay profile record and stores record in 'genetic_alteration' table. - * @param line row from the separated-text that contains one or more values on a single sample - * @param nrColumns - * @param sampleStartIndex index of the first sample column - * @param genericAssayIdIndex index of the column that uniquely identifies a sample - * @param filteredSampleIndices - * @param daoGeneticAlteration - * @return - * @throws DaoException */ - - private boolean parseGenericAssayLine(String line, int nrColumns, int sampleStartIndex, int genericAssayIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration, Map genericAssayStableIdToEntityIdMap) throws DaoException { + private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map genericAssayStableIdToEntityIdMap) { boolean recordIsStored = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } + Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(genericAssayId, null); - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - - String stableId = parts[genericAssayIdIndex]; - Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(stableId, null); - - if (entityId == null) { - ProgressMonitor.logWarning("Generic Assay entity " + parts[genericAssayIdIndex] + " not found in DB. 
Record will be skipped."); - } else { - recordIsStored = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, entityId, - EntityType.GENERIC_ASSAY, stableId); - } - - return recordIsStored; + if (entityId == null) { + ProgressMonitor.logWarning("Generic Assay entity " + genericAssayId + " not found in DB. Record will be skipped."); + } else { + recordIsStored = storeGeneticEntityGeneticAlterations(values, entityId, EntityType.GENERIC_ASSAY, genericAssayId); } return recordIsStored; @@ -816,14 +750,12 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int sampleStar /** * Stores genetic alteration data for a genetic entity. * @param values - * @param daoGeneticAlteration * @param geneticEntityId - internal id for genetic entity * @param geneticEntityType - "GENE", "GENESET", "PHOSPHOPROTEIN" * @param geneticEntityName - hugo symbol for "GENE", external id for "GENESET", phospho gene name for "PHOSPHOPROTEIN" * @return boolean indicating if record was stored successfully or not */ - private boolean storeGeneticEntityGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration, - Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { + private boolean storeGeneticEntityGeneticAlterations(String[] values, Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { try { if (importedGeneticEntitySet.add(geneticEntityId)) { daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values); diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 744ca565..2e767618 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -43,30 +43,6 @@ * @author Ethan Cerami. */ public class FileUtil { - /** - * BioPAX File Type. 
- */ - public static final int BIOPAX = 0; - - /** - * PSI_MI File Type. - */ - public static final int PSI_MI = 1; - - /** - * External DBs File Type. - */ - public static final int EXTERNAL_DBS = 2; - - /** - * Identifiers File Type. - */ - public static final int IDENTIFIERS = 3; - - /** - * Unknown File Type. - */ - public static final int UNKNOWN = 4; /** * Gets Number of Lines in Specified File. @@ -77,32 +53,26 @@ public class FileUtil { */ public static int getNumLines(File file) throws IOException { int numLines = 0; - FileReader reader = new FileReader(file); - BufferedReader buffered = new BufferedReader(reader); - String line = buffered.readLine(); - while (line != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { - numLines++; + try (FileReader reader = new FileReader(file); BufferedReader buffered = new BufferedReader(reader)) { + String line = buffered.readLine(); + while (line != null) { + if (isInfoLine(line)) { + numLines++; + } + line = buffered.readLine(); } - line = buffered.readLine(); + return numLines; } - reader.close(); - return numLines; } /** - * Gets Next Line of Input. Filters out Empty Lines and Comments. - * - * @param buf BufferedReader Object. - * @return next line of input. - * @throws IOException Error reading input stream. + * Does line brings any information? + * e.g. 
blank like and comments do not + * @param line + * @return */ - public static String getNextLine(BufferedReader buf) throws IOException { - String line = buf.readLine(); - while (line != null && (line.trim().length() == 0 - || line.trim().startsWith("#"))) { - line = buf.readLine(); - } - return line; + public static boolean isInfoLine(String line) { + return !line.startsWith("#") && line.trim().length() > 0; } + } \ No newline at end of file diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index 33779cd3..800a368e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -38,7 +38,6 @@ import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneset; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoPatient; @@ -48,15 +47,12 @@ import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.model.CopyNumberStatus; -import org.mskcc.cbio.portal.model.Geneset; import org.mskcc.cbio.portal.model.GeneticAlterationType; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; -import org.mskcc.cbio.portal.scripts.ImportGenesetData; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import 
org.springframework.test.context.ContextConfiguration; @@ -171,9 +167,8 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); assertEquals ("0", value); @@ -236,9 +231,8 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); assertEquals (value, "0"); @@ -321,9 +315,8 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, 
null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "DD639").getInternalId(); @@ -375,9 +368,8 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); // check if expected warnings are given: ArrayList warnings = ProgressMonitor.getWarnings(); @@ -468,9 +460,8 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "SAMPLE1").getInternalId(); From a7aab3a686332208dcfe855afed4d41bfccb443f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 3 May 2024 16:24:07 +0200 Subject: [PATCH 044/130] Implement incremental upload of mRNA data --- .../cbio/portal/dao/DaoGeneticAlteration.java | 19 ++- .../scripts/GeneticAlterationImporter.java | 4 + 
.../portal/scripts/ImportProfileData.java | 2 + .../portal/scripts/ImportTabDelimData.java | 161 +++++++++++++++--- .../org/mskcc/cbio/portal/util/CnaUtil.java | 3 +- .../TestIncrementalTabDelimData.java | 139 +++++++++++++++ .../scripts/TestImportTabDelimData.java | 10 +- .../data_expression_Zscores.txt | 24 +++ 8 files changed, 325 insertions(+), 37 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java create mode 100644 src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 25eca11c..fcc2380e 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -94,7 +94,8 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String throws DaoException { return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values); } - + + // TODO inc: update instead public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values) throws DaoException { @@ -237,8 +238,8 @@ public HashMap> getGeneticAlterationMapForEntit int geneticEntityId = rs.getInt("GENETIC_ENTITY_ID"); String values = rs.getString("VALUES"); //hm.debug.. 
- String valueParts[] = values.split(DELIM); - for (int i=0; i getProcessedAlterationData( rs = pstmt.executeQuery(); while (rs.next()) { long entrezGeneId = DaoGeneOptimized.getEntrezGeneId(rs.getInt("GENETIC_ENTITY_ID")); - String[] values = rs.getString("VALUES").split(DELIM); + String valuesString = rs.getString("VALUES"); + if (valuesString.endsWith(DELIM)) { + valuesString = valuesString.substring(0, valuesString.length() - DELIM.length()); + } + String[] values = valuesString.split(DELIM, -1); ObjectNode datum = processor.process( entrezGeneId, values, @@ -425,17 +430,19 @@ public int getCount() throws DaoException { * Deletes all Genetic Alteration Records associated with the specified Genetic Profile ID. * * @param geneticProfileId Genetic Profile ID. + * @param geneticEntityId Genetic Entity ID. * @throws DaoException Database Error. */ - public void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException { + public void deleteAllRecordsInGeneticProfile(long geneticProfileId, long geneticEntityId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoGeneticAlteration.class); pstmt = con.prepareStatement("DELETE from " + - "genetic_alteration WHERE GENETIC_PROFILE_ID=?"); + "genetic_alteration WHERE GENETIC_PROFILE_ID=? 
and GENETIC_ENTITY_ID=?"); pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, geneticEntityId); pstmt.executeUpdate(); } catch (SQLException e) { throw new DaoException(e); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index 32aa43f2..623b3122 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -55,5 +55,9 @@ public boolean store( } } + public boolean isImportedAlready(CanonicalGene gene) { + return importSetOfGenes.contains(gene.getEntrezGeneId()); + } + } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index e4b11844..d34ab2cc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -127,6 +127,7 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), + false, daoGeneticAlteration, daoGene ); genericAssayProfileImporter.importData(); @@ -151,6 +152,7 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, + false, daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 81300e63..76ffcb0b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -36,6 +36,7 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import 
java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -74,6 +75,11 @@ public class ImportTabDelimData { private DaoGeneOptimized daoGene; + private boolean updateMode; + private HashMap> geneticAlterationMap; + private ArrayList orderedImportedSampleList; + private ArrayList orderedSampleList; + /** * Constructor. * @@ -83,7 +89,8 @@ public class ImportTabDelimData { * @param geneticProfileId GeneticProfile ID. * @param genePanel GenePanel * @param genericEntityProperties Generic Assay Entities. - * + * @param updateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -92,10 +99,11 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, targetLine, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this(dataFile, targetLine, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); this.genericEntityProperties = genericEntityProperties; } @@ -106,7 +114,8 @@ public ImportTabDelimData( * @param targetLine The line we want to import. * If null, all lines are imported. * @param geneticProfileId GeneticProfile ID. - * + * @param updateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -114,16 +123,18 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this(dataFile, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); this.targetLine = targetLine; } /** * Constructor. 
* + * @param updateMode if true, update/append data to the existing one * @param dataFile Data File containing Copy Number Alteration, MRNA Expression Data, or protein RPPA data * @param geneticProfileId GeneticProfile ID. */ @@ -131,15 +142,18 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanel = genePanel; + this.updateMode = updateMode; this.daoGeneticAlteration = daoGeneticAlteration; this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.daoGene = daoGene; + this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); } /** @@ -154,10 +168,10 @@ public void importData() throws IOException, DaoException { } catch (IOException e) { throw new RuntimeException(e); } + if (updateMode) { + geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfile.getGeneticProfileId(), null); + } ProgressMonitor.setMaxValue(numLines); - - geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); @@ -214,9 +228,9 @@ public void importData() throws IOException, DaoException { pdAnnotationsForStableSampleIds = readPdAnnotations(this.pdAnnotationsFile); } // link Samples to the genetic profile - ArrayList orderedSampleList = new ArrayList(); ArrayList filteredSampleIndices = new ArrayList(); this.pdAnnotations = new HashMap<>(); + this.orderedSampleList = new ArrayList<>(); for (int i = 0; i < sampleIds.length; i++) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(sampleIds[i])); @@ -231,10 +245,7 @@ public void importData() throws IOException, 
DaoException { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } } - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); - } + ensureSampleGeneticProfile(sample); orderedSampleList.add(sample.getInternalId()); if (pdAnnotationsForStableSampleIds != null) { Set> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet()); @@ -259,7 +270,7 @@ public void importData() throws IOException, DaoException { } ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + saveOrderedSampleList(); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); @@ -331,7 +342,7 @@ public void importData() throws IOException, DaoException { if (targetLine == null || firstCellValue.equals(targetLine)) { recordAdded = saveLine(sampleValues, entrez, geneSymbol, - isRppaProfile, isDiscretizedCnaProfile, orderedSampleList, + isRppaProfile, isDiscretizedCnaProfile, existingCnaEvents); } } @@ -350,6 +361,7 @@ public void importData() throws IOException, DaoException { line = buf.readLine(); } + expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -371,6 +383,66 @@ public void importData() throws IOException, DaoException { } } + private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { + if (updateMode) { + // Expand remaining genetic entity id rows that were not mentioned in the file + new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { + try { + String[] values = 
orderedImportedSampleList.stream() + .map(sampleId -> + geneticAlterationMap.get(geneticEntityId).containsKey(sampleId) ? + geneticAlterationMap.get(geneticEntityId).get(sampleId) : "") + .toArray(String[]::new); + + saveValues(geneticEntityId, values); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }); + } + } + + private void ensureSampleGeneticProfile(Sample sample) throws DaoException { + if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { + Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); + if (updateMode) { + DaoSampleProfile.deleteRecords(List.of(sample.getInternalId()), List.of(geneticProfileId)); + } + DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); + } + } + + private void saveOrderedSampleList() throws DaoException { + if (updateMode) { + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); + // add all new sample ids at the end + ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); + List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + extendedSampleList.addAll(newSampleIds); + orderedImportedSampleList = orderedSampleList; + orderedSampleList = extendedSampleList; + + + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(geneticProfileId); + } + DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + } + + //TODO move somewhere else + private Map zip(K[] keys, V[] values) { + Map map = new HashMap<>(); + + // Check if both arrays have the same length + if (keys.length == values.length) { + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + } else { + throw new IllegalArgumentException("Arrays must be of the same length"); + } + return map; + } + private Map, Map> readPdAnnotations(File 
pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; @@ -524,7 +596,6 @@ private Map, Map> readPdAnnotations(File * * @param isRppaProfile true if this is an rppa profile (i.e. alteration type is PROTEIN_LEVEL and the first column is Composite.Element.Ref) * @param isDiscretizedCnaProfile true if this is a discretized CNA profile (i.e. alteration type COPY_NUMBER_ALTERATION and showProfileInAnalysisTab is true) - * @param orderedSampleList a list of the internal sample ids corresponding to the sample names in the header line * @param existingCnaEvents a collection of CnaEvents, to be added to or updated during parsing of individual lines * @return true if any record was stored in genetic_alteration, else false * @throws DaoException if any DaoException is thrown while using daoGene or daoGeneticAlteration @@ -534,7 +605,6 @@ private boolean saveLine(String[] values, String geneSymbol, boolean isRppaProfile, boolean isDiscretizedCnaProfile, - List orderedSampleList, Set existingCnaEvents ) throws DaoException { @@ -600,7 +670,7 @@ private boolean saveLine(String[] values, if (!microRNAGenes.isEmpty()) { // for micro rna, duplicate the data for (CanonicalGene gene : microRNAGenes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + if (this.saveValues(gene, values)) { recordStored = true; } } @@ -620,14 +690,14 @@ private boolean saveLine(String[] values, // none of the matched genes are type "miRNA" if (genes.size() == 1) { // Store all values per gene: - recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); + recordStored = this.saveValues(genes.get(0), values); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { - CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, orderedSampleList, genes)); + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, genes)); } } else 
{ if (isRppaProfile) { // for protein data, duplicate the data - recordStored = saveRppaValues(values, geneSymbol, recordStored, genes); + recordStored = saveRppaValues(values, recordStored, genes); } else { if (!recordStored) { // this case : @@ -640,9 +710,51 @@ private boolean saveLine(String[] values, return recordStored; } - private boolean saveRppaValues(String[] values, String geneSymbol, boolean recordStored, List genes) throws DaoException { + private boolean saveValues(CanonicalGene canonicalGene, String[] values) throws DaoException { + //TODO Think of better way. We do that to do not remove genes that contain duplicate + if (geneticAlterationImporter.isImportedAlready(canonicalGene)) { + return false; + } + if (updateMode) { + values = updateValues(canonicalGene.getGeneticEntityId(), values); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + } + return geneticAlterationImporter.store(values, canonicalGene, canonicalGene.getHugoGeneSymbolAllCaps()); + } + //TODO unify saveValues versions + // With update mode the last duplicate wins. 
It's different from the other function + private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { + if (updateMode) { + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); + values = updateValues(geneticEntityId, values); + } + return daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values) > 0; + } + + private String[] updateValues(int geneticEntityId, String[] values) { + //TODO swap variables + Map sampleIdToValue = zip(orderedImportedSampleList.toArray(new Integer[0]), values); + String[] updatedSampleValues = new String[orderedSampleList.size()]; + for (int i = 0; i < orderedSampleList.size(); i++) { + updatedSampleValues[i] = ""; + int sampleId = orderedSampleList.get(i); + if (geneticAlterationMap.containsKey(geneticEntityId)) { + HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); + updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? 
savedSampleIdToValue.remove(sampleId): ""; + if (savedSampleIdToValue.isEmpty()) { + geneticAlterationMap.remove(geneticEntityId); + } + } + if (sampleIdToValue.containsKey(sampleId)) { + updatedSampleValues[i] = sampleIdToValue.get(sampleId); + } + } + return updatedSampleValues; + } + + private boolean saveRppaValues(String[] values, boolean recordStored, List genes) throws DaoException { for (CanonicalGene gene : genes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + if (this.saveValues(gene, values)) { recordStored = true; nrExtraRecords++; } @@ -677,7 +789,7 @@ private List parseGenes(String entrez, String geneSymbol) { return List.of(); } - private List composeCnaEventsToAdd(String[] values, List orderedSampleList, List genes) { + private List composeCnaEventsToAdd(String[] values, List genes) { List cnaEventsToAdd = new ArrayList(); long entrezGeneId = genes.get(0).getEntrezGeneId(); for (int i = 0; i < values.length; i++) { @@ -758,8 +870,7 @@ private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map private boolean storeGeneticEntityGeneticAlterations(String[] values, Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { try { if (importedGeneticEntitySet.add(geneticEntityId)) { - daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values); - return true; + return saveValues(geneticEntityId, values); } else { ProgressMonitor.logWarning("Data for genetic entity " + geneticEntityName diff --git a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java index 3cc6fd71..de7fe85a 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java @@ -45,6 +45,7 @@ public CnaUtil(String[] headerParts, Set namespaces) { this.namespaceColumnParser = new NamespaceColumnParser(namespaces, headerParts); } + // TODO 
inc: update public static void storeCnaEvents( Set existingCnaEvents, List cnaEventsToAdd @@ -53,7 +54,7 @@ public static void storeCnaEvents( if (!CNA.AMP.equals(cnaEvent.getAlteration()) && !CNA.HOMDEL.equals(cnaEvent.getAlteration())) { continue; } - + // TODO Clean cnv event // Revert PR https://github.com/cBioPortal/cbioportal-core/pull/1 breaks importer Optional existingCnaEvent = existingCnaEvents .stream() diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java new file mode 100644 index 00000000..2891373b --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -0,0 +1,139 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Tab Delimited Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalTabDelimData { + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + + // Hugo_Symbol: CDK1 + static final long NEW_GENE_ENTREZ_ID = 983l; + + /** + * Gene that is part of the platform, but absent during the incremental upload + */ + // Hugo_Symbol: ARAF + static final long ABSENT_GENE_ENTREZ_ID = 369l; + static final Set TEST_ENTREZ_GENE_IDS = Set.of(10000l, 207l, 208l, 3265l, ABSENT_GENE_ENTREZ_ID, 3845l, 472l, 4893l, 672l, 673l, 675l, NEW_GENE_ENTREZ_ID); + + // stable_id: TCGA-A1-A0SB-01 + static final int NEW_SAMPLE_ID = 1; + + // stable_id: TCGA-A1-A0SD-01 + static final int UPDATED_SAMPLE_ID = 2; + static final Set TEST_SAMPLE_IDS = Set.of(NEW_SAMPLE_ID, UPDATED_SAMPLE_ID, 3, 6, 8, 9, 10, 12, 13); + + /** + * Test incremental upload of MRNA_EXPRESSION + */ + @Test + public void testMrnaExpression() throws DaoException, IOException { + /** + * Prior checks + */ + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + assertNotNull(mrnaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, 
beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + /** + * Test + */ + new ImportTabDelimData(dataFile, + mrnaProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); + afterResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); + if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { + return; + } + sampleIdToValue.forEach((sampleId, value) -> { + if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { + return; + } + assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + }); + }); + assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + +} diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index 800a368e..f8bcc335 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -167,7 +167,7 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); @@ -231,7 +231,7 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); @@ -315,7 +315,7 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, 
newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); @@ -368,7 +368,7 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); // check if expected warnings are given: @@ -460,7 +460,7 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt new file mode 100644 index 00000000..02652403 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -0,0 +1,24 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +AKT3 10000 0.6393 0.5377 +AKT1 207 0.7850 0.0426 +# The pipe and after have to be removed 
+AKT2|TEST 208 1.0741 0.7180 +HRAS 3265 -0.1735 -0.6412 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ARAF 369 0.6393 0.5377 +KRAS 3845 0.7850 0.0426 +ATM 472 1.0741 0.7180 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -0.1735 -0.6412 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 0.6393 0.5377 +BRAF 673 0.7850 0.0426 +# Duplicate lines should be ignored +BRAF 673 0.7851 0.0427 +BRCA2 675 1.0741 0.7180 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1 983 -0.1735 -0.6412 +# These lines have to be skipped +/// 369 0.6393 0.5377 +--- 3845 0.7850 0.0426 + 1.0741 0.7180 From bd2d8c1c038d3258390f90553b066300f7d24741 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:11:40 +0200 Subject: [PATCH 045/130] Add RPPA test --- .../TestIncrementalTabDelimData.java | 55 +++++++++++++++++++ .../data_expression_Zscores.txt | 2 +- .../incremental/tab_delim_data/data_rppa.txt | 24 ++++++++ src/test/resources/seed_mini.sql | 14 +++++ 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 src/test/resources/incremental/tab_delim_data/data_rppa.txt diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index 2891373b..f57e5031 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -136,4 +136,59 @@ public void testMrnaExpression() throws DaoException, IOException { assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); } + /** + * Test incremental upload of PROTEIN_LEVEL + */ + @Test + public void testRppa() throws 
DaoException, IOException { + /** + * Prior checks + */ + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); + assertNotNull(rppaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_rppa.txt"); + + /** + * Test + */ + new ImportTabDelimData(dataFile, + rppaProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); + afterResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); + if (entrezGeneId == 
NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { + return; + } + sampleIdToValue.forEach((sampleId, value) -> { + if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { + return; + } + assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + }); + }); + assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + } diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 02652403..1ca71772 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,7 +1,7 @@ Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 AKT3 10000 0.6393 0.5377 AKT1 207 0.7850 0.0426 -# The pipe and after have to be removed +# All after the pipe has to be removed AKT2|TEST 208 1.0741 0.7180 HRAS 3265 -0.1735 -0.6412 # This gene absent in this file, but it's still part of the profile and has to be updated diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/tab_delim_data/data_rppa.txt new file mode 100644 index 00000000..bc3b858a --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_rppa.txt @@ -0,0 +1,24 @@ +Composite.Element.REF TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +AKT3|akt3 1.26122710480548 0.037186254715365 +# Multiple gene symbols joined by space +AKT1 AKT2 AKT3|akt1 1.61253243664957 -0.141077088398489 +# 
All after the pipe has to be removed +AKT2|TEST 5.4424238579025E-05 0.062264661774981 +HRAS|hras 0.37624053370992 0.270399126328659 +# This gene absent in this file, but it's still part of the profile and has to be updated 0.407622077164699 -0.326522823583974 +#ARAF 0.383702820778609 0.218650367364756 +KRAS|kras -0.335040546938807 0.00730643372831408 +ATM|atm 0.037186254715365 1.26122710480548 +# This line missing the entrez id and the gene has to be detected by hugo symbol 0.062264661774981 5.4424238579025E-05 +BRCA1|brca1 0.270399126328659 0.37624053370992 +BRAF|braf -0.326522823583974 0.407622077164699 +# Duplicate lines should be ignored 0.218650367364756 0.383702820778609 +BRAF|braf 0.00730643372831408 -0.335040546938807 +BRCA2|brca2 -0.141077088398489 1.61253243664957 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1|cdk1 -0.141047088398489 1.61253243564957 +# These lines have to be skipped +/// -0.335040546938807 0.00730643372831408 +--- 0.037186254715365 1.26122710480548 + 0.064 0.644 +NA|K-Ras 0.062264661774981 5.4424238579025E-05 diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 3dfd5ff9..545c85bd 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -328,6 +328,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (6,'study_tcga_pub_mutations',1,'MUTATION_EXTENDED','MAF','Mutations','Mutation data from whole exome sequencing.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test 
data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); @@ -375,12 +376,25 @@ INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALU INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'0.066638638,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.020369562,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'0.793930197,'); +-- RPPA +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'-0.472,1.514,0.145,-0.183,0.913,-0.665,-1.700,0.976,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 207),'-1.102,-0.243,0.018,-0.154,0.330,1.005,0.681,-0.664,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 
208),'-1.221,-0.592,-0.176,-0.310,-1.198,-0.670,0.077,-0.302,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3265),'0.061,-0.055,-0.165,0.517,2.021,0.381,-0.728,0.944,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 369),'-1.129,-0.306,0.180,-0.601,0.166,0.402,0.243,-0.999,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3845),'0.177,0.404,0.188,0.428,1.676,0.238,0.469,2.161,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 472),'-1.503,-1.925,-1.755,-1.576,-1.029,-1.401,-1.514,-2.074,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 4893),'-1.914,-2.059,-1.228,-1.322,-4.166,-1.187,0.284,-0.130,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'-1.661,-1.392,-1.924,-1.656,-0.361,-1.998,-0.136,-0.709,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.233,0.561,-0.106,-0.085,-0.012,0.143,0.141,0.609,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'-0.570,-1.340,-1.544,-0.404,0.632,-1.231,0.771,-0.036,'); -- genetic_profile_samples INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES 
(2,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (3,'2,3,6,8,9,10,12,13,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (5,'2,'); +INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (9,'2,3,6,8,9,10,12,13,'); -- patient INSERT INTO "patient" ("INTERNAL_ID","STABLE_ID","CANCER_STUDY_ID") VALUES (1,'TCGA-A1-A0SB',1); From 8b68331266359b73209da2d7114f6918bbdf232f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:24:37 +0200 Subject: [PATCH 046/130] Add normal sample to thest data to test skipping --- .../data_expression_Zscores.txt | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 1ca71772..4204bf1e 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,24 +1,24 @@ -Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 -AKT3 10000 0.6393 0.5377 -AKT1 207 0.7850 0.0426 -# All after the pipe has to be removed -AKT2|TEST 208 1.0741 0.7180 -HRAS 3265 -0.1735 -0.6412 -# This gene absent in this file, but it's still part of the profile and has to be updated -#ARAF 369 0.6393 0.5377 -KRAS 3845 0.7850 0.0426 -ATM 472 1.0741 0.7180 -# This line missing the hugo symbol and the gene has to be detected by entrez id - 4893 -0.1735 -0.6412 -# This line missing the entrez id and the gene has to be detected by hugo symbol -BRCA1 0.6393 0.5377 -BRAF 673 0.7850 0.0426 -# Duplicate lines should be ignored -BRAF 673 0.7851 0.0427 -BRCA2 675 1.0741 0.7180 -# 
This gene is new! the empty values should be set for the already existing samples in the database -CDK1 983 -0.1735 -0.6412 -# These lines have to be skipped -/// 369 0.6393 0.5377 ---- 3845 0.7850 0.0426 - 1.0741 0.7180 +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SD-01 +AKT3 10000 0.6393 0.1 0.5377 +AKT1 207 0.785 0.1 0.0426 +# All after the pipe has to be removed +AKT2|TEST 208 1.0741 0.1 0.718 +HRAS 3265 -0.1735 0.1 -0.6412 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ARAF 369 0.6393 0.1 0.5377 +KRAS 3845 0.785 0.1 0.0426 +ATM 472 1.0741 0.1 0.718 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -0.1735 0.1 -0.6412 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 0.6393 0.1 0.5377 +BRAF 673 0.785 0.1 0.0426 +# Duplicate lines should be ignored +BRAF 673 0.7851 0.1 0.0427 +BRCA2 675 1.0741 0.1 0.718 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 -0.1735 0.1 -0.6412 +# These lines have to be skipped +/// 369 0.6393 0.1 0.5377 +--- 3845 0.785 0.1 0.0426 + 1.0741 0.1 0.718 From b18aab11e0b706cd2ba81fd9e80dfef088fe5e68 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:49:38 +0200 Subject: [PATCH 047/130] Add rows with more columns then in header to skip --- .../data_expression_Zscores.txt | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 4204bf1e..6764c6b1 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,24 +1,30 @@ Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SD-01 AKT3 10000 0.6393 0.1 0.5377 AKT1 207 0.785 0.1 0.0426 -# All after the pipe has to be removed +# All after the pipe has to be removed AKT2|TEST 208 1.0741 0.1 0.718 HRAS 3265 -0.1735 0.1 -0.6412 -# This gene absent in this file, but it's still part of the profile and has to be updated +# This gene absent in this file, but it's still part of the profile and has to be updated #ARAF 369 0.6393 0.1 0.5377 KRAS 3845 0.785 0.1 0.0426 ATM 472 1.0741 0.1 0.718 -# This line missing the hugo symbol and the gene has to be detected by entrez id +# This line missing the hugo symbol and the gene has to be detected by entrez id 4893 -0.1735 0.1 -0.6412 -# This line missing the entrez id and the gene has to be detected by hugo symbol +# This line missing the entrez id and the gene has to be detected by hugo symbol BRCA1 0.6393 0.1 0.5377 BRAF 673 0.785 0.1 0.0426 -# Duplicate lines should be ignored +# Duplicate lines should be ignored BRAF 673 0.7851 0.1 0.0427 -BRCA2 675 1.0741 0.1 0.718 -# 
This gene is new! the empty values should be set for the already existing samples in the database +# Although this row has 2 extra columns, we are ok with that as they contain blank values +BRCA2 675 1.0741 0.1 0.718 +# This gene is new! the empty values should be set for the already existing samples in the database CDK1 983 -0.1735 0.1 -0.6412 -# These lines have to be skipped +# These lines have to be skipped +# One column too much +FGFR3 2261 0.045 0.1 0.675 0.0224575 +# Multigene sign /// 369 0.6393 0.1 0.5377 +# Unknown gene sign --- 3845 0.785 0.1 0.0426 +# Empty gene info 1.0741 0.1 0.718 From ea688c315d000b2450351cdcca94cea963f4f117 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:00:03 +0200 Subject: [PATCH 048/130] Skip rows that don't have enough sample columns --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 +++- .../incremental/tab_delim_data/data_expression_Zscores.txt | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 76ffcb0b..5432c2ff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -300,7 +300,9 @@ public void importData() throws IOException, DaoException { if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); - + } else if (rowParts.length < headerColumns) { + ProgressMonitor.logWarning("Ignoring line with less fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); } else { String sampleValues[] = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length > headerColumns ? 
headerColumns : rowParts.length); diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 6764c6b1..5c3c9012 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -22,6 +22,8 @@ CDK1 983 -0.1735 0.1 -0.6412 # These lines have to be skipped # One column too much FGFR3 2261 0.045 0.1 0.675 0.0224575 +# No sample columns +PIEZO1 9780 # Multigene sign /// 369 0.6393 0.1 0.5377 # Unknown gene sign From cdae5011077f5f4e275fc44a1c776226de63ae16 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:17:44 +0200 Subject: [PATCH 049/130] Test for invalid entrez id --- .../java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java | 3 ++- .../java/org/mskcc/cbio/portal/scripts/ImportGeneData.java | 2 +- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 3 +-- .../java/org/mskcc/cbio/portal/util/EntrezValidator.java | 7 +++++++ .../incremental/tab_delim_data/data_expression_Zscores.txt | 2 ++ 5 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 1f58acb7..3adbfb53 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -46,6 +46,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.EntrezValidator; import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -322,7 +323,7 @@ public List guessGene(String geneId, String chr) { } CanonicalGene gene; - if (geneId.matches("[0-9]+")) { // likely to be a entrez gene id + if 
(EntrezValidator.isaValidEntrezId(geneId)) { // likely to be a entrez gene id gene = getGene(Integer.parseInt(geneId)); if (gene!=null) { return Collections.singletonList(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 0ab8bd88..cc3300c0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -188,7 +188,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); String parts[] = line.split("\t", -1); // include trailing empty strings - if (!parts[0].matches("[0-9]+")) { + if (!EntrezValidator.isaValidEntrezId(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 5432c2ff..8627dce4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -336,8 +336,7 @@ public void importData() throws IOException, DaoException { if (entrez != null && entrez.isEmpty()) { entrez = null; } - if (entrez != null && !entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values + if (entrez != null && !EntrezValidator.isaValidEntrezId(entrez)) { ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); } else { String firstCellValue = rowParts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java b/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java new file mode 100644 index 00000000..335bfd66 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java 
@@ -0,0 +1,7 @@ +package org.mskcc.cbio.portal.util; + +public class EntrezValidator { + public static boolean isaValidEntrezId(String entrez) { + return entrez.matches("[0-9]+"); + } +} diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 5c3c9012..d7b646b0 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -24,6 +24,8 @@ CDK1 983 -0.1735 0.1 -0.6412 FGFR3 2261 0.045 0.1 0.675 0.0224575 # No sample columns PIEZO1 9780 +# invalid entrez id +P2RY10 -1 0.741 0.1 0.685 # Multigene sign /// 369 0.6393 0.1 0.5377 # Unknown gene sign From cf458a4d04f036626f9e1c038d58def0aa84fd22 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:55:01 +0200 Subject: [PATCH 050/130] Extract common code from inc. tab. delim. tests --- .../TestIncrementalTabDelimData.java | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index f57e5031..a201901e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -17,6 +17,7 @@ package org.mskcc.cbio.portal.integrationTest.incremental; +import org.jetbrains.annotations.NotNull; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -25,10 +26,7 @@ import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.dao.DaoSample; -import 
org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.GeneticProfile; -import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -38,7 +36,6 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; -import java.util.HashSet; import java.util.Set; import static org.junit.Assert.assertEquals; @@ -92,12 +89,7 @@ public void testMrnaExpression() throws DaoException, IOException { GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); assertNotNull(mrnaProfile); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); - }); + assertPriorDataState(beforeResult); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); @@ -116,20 +108,7 @@ public void testMrnaExpression() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); - afterResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); - if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { - return; - } - sampleIdToValue.forEach((sampleId, value) -> { - if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { - return; - } - assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, - beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); - }); - }); + assertNoChange(beforeResult, afterResult); assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); @@ -147,12 +126,7 @@ public void testRppa() throws DaoException, IOException { GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); assertNotNull(rppaProfile); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for 
gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); - }); + assertPriorDataState(beforeResult); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_rppa.txt"); @@ -171,6 +145,23 @@ public void testRppa() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertNoChange(beforeResult, afterResult); + assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + + private void assertPriorDataState(HashMap> beforeResult) { + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", 
beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + } + + private void assertNoChange(HashMap> beforeResult, HashMap> afterResult) { assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); afterResult.forEach((entrezGeneId, sampleIdToValue) -> { assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); @@ -185,10 +176,6 @@ public void testRppa() throws DaoException, IOException { beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); }); }); - assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); } } From 9ea1adacc0bceb912e03f5b9961b2093374bf42d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:32:53 +0200 Subject: [PATCH 051/130] Implement incremntal upload of cna data via tab. delim. 
loader --- .../mskcc/cbio/portal/dao/DaoCnaEvent.java | 27 +- .../portal/scripts/ImportTabDelimData.java | 13 +- .../TestIncrementalTabDelimData.java | 253 ++++++++++++++---- .../tab_delim_data/data_cna_discrete.txt | 17 ++ .../data_cna_pd_annotations.txt | 7 + 5 files changed, 263 insertions(+), 54 deletions(-) create mode 100644 src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt create mode 100644 src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index e7785d4f..0e4ab7e8 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -119,7 +119,32 @@ private static long addCnaEventDirectly(CnaEvent cnaEvent) throws DaoException { JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); } } - + + public static void removeSampleCnaEvents(int cnaProfileId, List sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoCnaEvent.class); + pstmt = con.prepareStatement + ("DELETE sample_cna_event, alteration_driver_annotation" + + " FROM sample_cna_event" + + " JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? 
AND sample_cna_event.`SAMPLE_ID` IN (" + + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + pstmt.setInt(1, cnaProfileId); + for (int i = 0; i < sampleIds.size(); i++) { + pstmt.setInt(i + 2, sampleIds.get(i)); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); + } + } + public static Map> getSamplesWithAlterations( Collection eventIds) throws DaoException { return getSamplesWithAlterations(StringUtils.join(eventIds, ",")); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 8627dce4..86c5fef8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -36,7 +36,6 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -694,7 +693,11 @@ private boolean saveLine(String[] values, recordStored = this.saveValues(genes.get(0), values); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { - CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, genes)); + if (updateMode) { + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedImportedSampleList); + } + long entrezGeneId = genes.get(0).getEntrezGeneId(); + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, entrezGeneId)); } } else { if (isRppaProfile) { // for protein data, duplicate the data @@ -790,9 +793,11 @@ private List parseGenes(String entrez, String geneSymbol) { return List.of(); } - private List composeCnaEventsToAdd(String[] values, List genes) { + private List composeCnaEventsToAdd(String[] values, long entrezGeneId) { + if 
(updateMode) { + values = updateValues((int) entrezGeneId, values); + } List cnaEventsToAdd = new ArrayList(); - long entrezGeneId = genes.get(0).getEntrezGeneId(); for (int i = 0; i < values.length; i++) { // temporary solution -- change partial deletion back to full deletion. diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index a201901e..64c4f4a6 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -17,15 +17,18 @@ package org.mskcc.cbio.portal.integrationTest.incremental; -import org.jetbrains.annotations.NotNull; +import org.cbioportal.model.CNA; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticEventImpl; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -35,14 +38,17 @@ import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static 
org.mskcc.cbio.portal.dao.DaoMutation.getMutations; /** * Tests Incremental Import of Tab Delimited Data. @@ -61,23 +67,6 @@ public void setUp() throws DaoException { DaoCancerStudy.reCacheAll(); } - // Hugo_Symbol: CDK1 - static final long NEW_GENE_ENTREZ_ID = 983l; - - /** - * Gene that is part of the platform, but absent during the incremental upload - */ - // Hugo_Symbol: ARAF - static final long ABSENT_GENE_ENTREZ_ID = 369l; - static final Set TEST_ENTREZ_GENE_IDS = Set.of(10000l, 207l, 208l, 3265l, ABSENT_GENE_ENTREZ_ID, 3845l, 472l, 4893l, 672l, 673l, 675l, NEW_GENE_ENTREZ_ID); - - // stable_id: TCGA-A1-A0SB-01 - static final int NEW_SAMPLE_ID = 1; - - // stable_id: TCGA-A1-A0SD-01 - static final int UPDATED_SAMPLE_ID = 2; - static final Set TEST_SAMPLE_IDS = Set.of(NEW_SAMPLE_ID, UPDATED_SAMPLE_ID, 3, 6, 8, 9, 10, 12, 13); - /** * Test incremental upload of MRNA_EXPRESSION */ @@ -86,10 +75,28 @@ public void testMrnaExpression() throws DaoException, IOException { /** * Prior checks */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); assertNotNull(mrnaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - 
assertPriorDataState(beforeResult); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); @@ -108,11 +115,18 @@ public void testMrnaExpression() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertNoChange(beforeResult, afterResult); - assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + HashMap newGeneRow = afterResult.get(newGeneEntrezId); + assertEquals("-0.1735", newGeneRow.get(newSampleId)); + assertEquals("-0.6412", newGeneRow.get(updateSampleId)); + HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); + assertEquals("", absentGeneRow.get(newSampleId)); + assertEquals("-1.12475", absentGeneRow.get(updateSampleId)); } /** @@ -123,10 +137,28 @@ public void testRppa() throws DaoException, IOException { /** * Prior checks */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 
675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); assertNotNull(rppaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_rppa.txt"); @@ -145,34 +177,157 @@ public void testRppa() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertNoChange(beforeResult, afterResult); - assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); + 
assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("-1.129", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + } + + /** + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) + */ + @Test + public void testDiscreteCNA() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + afterSampleIds.add(newSampleId); + + GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + assertNotNull(discreteCNAProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); + Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); + List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + null, + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> beforeSampleIdToSampleCnaEvents = 
beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_cna_discrete.txt"); + File pdAnnotations = new File(dataFolder, "data_cna_pd_annotations.txt"); + + /** + * Test + */ + ImportTabDelimData importer = new ImportTabDelimData(dataFile, + discreteCNAProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()); + importer.setPdAnnotationsFile(pdAnnotations); + importer.importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("1", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + + List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + afterResult.keySet(), + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); + 
beforeCnaEventsSampleIds.forEach(sampleId -> { + if (sampleId == updateSampleId) { + return; + } + Set beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); + }); + Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 208l, CNA.HOMDEL, + 3265l, CNA.AMP, + 4893l, CNA.HOMDEL, + 672l, CNA.AMP, + 673l, CNA.AMP, + 675l, CNA.HOMDEL, + newGeneEntrezId, CNA.HOMDEL + ), + newSampleEntrezGeneIdToCnaAlteration); + Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 10000l, CNA.HOMDEL, + 207l, CNA.AMP, + 3845l, CNA.AMP, + //FIXME + //absentGeneEntrezId, CNA.HOMDEL, + 673l, CNA.HOMDEL, + newGeneEntrezId, CNA.AMP + ), + updatedSampleEntrezGeneIdToCnaAlteration); } - private void assertPriorDataState(HashMap> beforeResult) { - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { + assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); beforeResult.forEach((entrezGeneId, 
sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + assertEquals("Samples for gene with entrez_id = " + entrezGeneId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entrezGeneId).keySet()); }); } - private void assertNoChange(HashMap> beforeResult, HashMap> afterResult) { - assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); - afterResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); - if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { - return; - } - sampleIdToValue.forEach((sampleId, value) -> { - if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { - return; - } - assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + private void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, + Set geneEntrezIds, + Set sampleIds) { + geneEntrezIds.forEach(entrezGeneId -> { + assertTrue("After result is expected to contain entrez_id=" + entrezGeneId, + afterResult.containsKey(entrezGeneId)); + sampleIds.forEach(sampleId -> { + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entrez_id=" + entrezGeneId, + afterResult.get(entrezGeneId).containsKey(sampleId)); + assertEquals("The values for sample_id=" + sampleId + + " and entrez_id=" + entrezGeneId + " before and after upload have 
to match.", beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); }); }); diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt b/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt new file mode 100644 index 00000000..7664e868 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt @@ -0,0 +1,17 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-XX-0800-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SO-01 +AKT3 10000 0 -2 -2 +AKT1 207 -1 2 2 +# All after the pipe has to be removed +AKT2|TEST 208 -2 2 -1 +HRAS 3265 2 2 0 +KRAS 3845 0 -2 2 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -2 -2 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 2 2 0 +BRAF 673 2 -2 -2 +BRCA2 675 -1.5 2 0 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1 983 -2 -2 2 diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt b/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt new file mode 100644 index 00000000..3fbcfc58 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt @@ -0,0 +1,7 @@ +SAMPLE_ID Entrez_Gene_Id cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +TCGA-A1-A0SO-01 3845 Putative_Passenger Test passenger Class 2 Class annotation +TCGA-A1-A0SO-01 208 Putative_Driver Test driver Class 1 Class annotation +TCGA-A1-A0SO-01 983 Putative_Passenger Test passenger +TCGA-XX-0800-01 3845 Class 2 Class annotation +TCGA-XX-0800-01 208 Class 1 Class annotation +TCGA-XX-0800-01 983 Putative_Driver From 03f966025e5d6fa6c2cf42115e6e6d01c381e0fe Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:42:08 +0200 
Subject: [PATCH 052/130] Blanken values for genes not mentioned in the file --- .../mskcc/cbio/portal/scripts/ImportTabDelimData.java | 8 ++------ .../incremental/TestIncrementalTabDelimData.java | 9 +++------ .../tab_delim_data/data_expression_Zscores.txt | 2 +- .../resources/incremental/tab_delim_data/data_rppa.txt | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 86c5fef8..3cfeaffc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -388,12 +388,8 @@ private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { // Expand remaining genetic entity id rows that were not mentioned in the file new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { try { - String[] values = orderedImportedSampleList.stream() - .map(sampleId -> - geneticAlterationMap.get(geneticEntityId).containsKey(sampleId) ? 
- geneticAlterationMap.get(geneticEntityId).get(sampleId) : "") - .toArray(String[]::new); - + String[] values = new String[orderedImportedSampleList.size()]; + Arrays.fill(values, ""); saveValues(geneticEntityId, values); } catch (DaoException e) { throw new RuntimeException(e); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index 64c4f4a6..ddea3269 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -28,7 +28,6 @@ import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; -import org.mskcc.cbio.portal.model.GeneticEventImpl; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -126,7 +125,7 @@ public void testMrnaExpression() throws DaoException, IOException { assertEquals("-0.6412", newGeneRow.get(updateSampleId)); HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); assertEquals("", absentGeneRow.get(newSampleId)); - assertEquals("-1.12475", absentGeneRow.get(updateSampleId)); + assertEquals("", absentGeneRow.get(updateSampleId)); } /** @@ -186,7 +185,7 @@ public void testRppa() throws DaoException, IOException { assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("-1.129", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); } /** @@ -260,7 
+259,7 @@ public void testDiscreteCNA() throws DaoException, IOException { assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("1", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), afterResult.keySet(), @@ -300,8 +299,6 @@ public void testDiscreteCNA() throws DaoException, IOException { 10000l, CNA.HOMDEL, 207l, CNA.AMP, 3845l, CNA.AMP, - //FIXME - //absentGeneEntrezId, CNA.HOMDEL, 673l, CNA.HOMDEL, newGeneEntrezId, CNA.AMP ), diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index d7b646b0..dc189cec 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -5,7 +5,7 @@ AKT1 207 0.785 0.1 0.0426 AKT2|TEST 208 1.0741 0.1 0.718 HRAS 3265 -0.1735 0.1 -0.6412 # This gene absent in this file, but it's still part of the profile and has to be updated -#ARAF 369 0.6393 0.1 0.5377 +#ARAF 369 KRAS 3845 0.785 0.1 0.0426 ATM 472 1.0741 0.1 0.718 # This line missing the hugo symbol and the gene has to be detected by entrez id diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/tab_delim_data/data_rppa.txt index bc3b858a..0953ce99 100644 --- a/src/test/resources/incremental/tab_delim_data/data_rppa.txt +++ b/src/test/resources/incremental/tab_delim_data/data_rppa.txt @@ -6,7 +6,7 @@ AKT1 AKT2 AKT3|akt1 1.61253243664957 -0.141077088398489 AKT2|TEST 5.4424238579025E-05 0.062264661774981 HRAS|hras 0.37624053370992 0.270399126328659 # This gene absent in this 
file, but it's still part of the profile and has to be updated 0.407622077164699 -0.326522823583974 -#ARAF 0.383702820778609 0.218650367364756 +#ARAF KRAS|kras -0.335040546938807 0.00730643372831408 ATM|atm 0.037186254715365 1.26122710480548 # This line missing the entrez id and the gene has to be detected by hugo symbol 0.062264661774981 5.4424238579025E-05 From 93cc6ffa80f8c7f0e0fad42b4775be384f3d8709 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:52:03 +0200 Subject: [PATCH 053/130] Remove unused code --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 3cfeaffc..fd658f0a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -219,7 +219,6 @@ public void importData() throws IOException, DaoException { sampleIds = new String[headerParts.length - sampleStartIndex]; System.arraycopy(headerParts, sampleStartIndex, sampleIds, 0, headerParts.length - sampleStartIndex); - int nrUnknownSamplesAdded = 0; ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); Map, Map> pdAnnotationsForStableSampleIds = null; @@ -261,9 +260,6 @@ public void importData() throws IOException, DaoException { if (pdAnnotationsForStableSampleIds != null && !pdAnnotationsForStableSampleIds.keySet().isEmpty()) { ProgressMonitor.logWarning("WARNING: Following pd annotation sample-entrezId pairs newer used in the data file: " + pdAnnotationsForStableSampleIds.keySet()); } - if (nrUnknownSamplesAdded > 0) { - ProgressMonitor.logWarning("WARNING: Number of samples added on the fly because they were missing in clinical data: " + nrUnknownSamplesAdded); - } if (samplesSkipped > 0) { ProgressMonitor.setCurrentMessage(" --> total number of 
samples skipped (normal samples): " + samplesSkipped); } From 842bcd3c9ff867667206eab7ce11f4ad78e42c62 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 14:52:19 +0200 Subject: [PATCH 054/130] Throw unsupported operation exception for GENESET_SCORE incremental upload --- .../portal/scripts/ImportTabDelimData.java | 5 +++++ .../TestIncrementalTabDelimData.java | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index fd658f0a..d277cb95 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -153,6 +153,11 @@ public ImportTabDelimData( this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.daoGene = daoGene; this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (this.updateMode + && geneticProfile != null + && this.geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE) { + throw new UnsupportedOperationException("Incremental upload of geneset scores is not supported."); + } } /** diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index ddea3269..ec82dcc9 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -28,6 +28,7 @@ import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; import 
org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -47,6 +48,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -305,6 +307,25 @@ public void testDiscreteCNA() throws DaoException, IOException { updatedSampleEntrezGeneIdToCnaAlteration); } + @Test + public void testGsvaIsNotSupported() throws DaoException, IOException { + GeneticProfile gsvaProfile = new GeneticProfile(); + gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); + gsvaProfile.setStableId("gsva_scores"); + gsvaProfile.setDatatype("GENESET_SCORE"); + gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); + gsvaProfile.setProfileName("gsva test platform"); + DaoGeneticProfile.addGeneticProfile(gsvaProfile); + + assertThrows(UnsupportedOperationException.class, () -> + new ImportTabDelimData(File.createTempFile("gsva", "test"), + DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance())); + } + private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { From 22b688aaeeab8cea4744f0e345e28cabcc5c1e60 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 16:45:57 +0200 Subject: [PATCH 055/130] Add generic assay data incremental upload test --- .../TestIncrementalTabDelimData.java | 117 +++++++++++++++--- .../tab_delim_data/data_treatment_ic50.txt | 8 ++ src/test/resources/seed_mini.sql | 12 ++ 3 files changed, 122 insertions(+), 15 deletions(-) create mode 100644 
src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index ec82dcc9..4f4b2aef 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -18,6 +18,7 @@ package org.mskcc.cbio.portal.integrationTest.incremental; import org.cbioportal.model.CNA; +import org.jetbrains.annotations.NotNull; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -26,6 +27,7 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; import org.mskcc.cbio.portal.model.GeneticAlterationType; @@ -48,6 +50,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; @@ -326,27 +329,111 @@ public void testGsvaIsNotSupported() throws DaoException, IOException { DaoGeneOptimized.getInstance())); } - private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { - assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("Samples for gene with entrez_id = " + entrezGeneId + " have to match expected ones", - expectedSampleIds, beforeResult.get(entrezGeneId).keySet()); + /** + * Test incremental upload of GENERIC_ASSAY + */ + @Test + public void testGenericAssay() 
throws DaoException, IOException { + /** + * Prior checks + */ + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + beforeStableIds.add(absentStableId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); + + /** + * Test + */ + new ImportTabDelimData( + dataFile, + null, + ic50Profile.getGeneticProfileId(), + null, + "NAME,DESCRIPTION,URL", + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + assertEquals("After result should have the same amount of entries", beforeResult.size(), afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), 
Set.of(noChangeSampleId)); + int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); + assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); + assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); + int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); + assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); + assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); + int absentEntityId = geneStableIdToEntityId(absentStableId); + assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); + assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); + int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); + assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); + assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); + assertNull("No new generic entity has been added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + } + + @NotNull + private Set geneStableIdsToEntityIds(Set beforeStableIds) { + return beforeStableIds.stream().map(stableId -> { + try { + return geneStableIdToEntityId(stableId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toSet()); + } + + private int geneStableIdToEntityId(String stableId) throws DaoException { + return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); + } + + private void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { + assertEquals(expectedEntityIds, beforeResult.keySet()); + beforeResult.forEach((entityId, sampleIdToValue) -> { + assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entityId).keySet()); }); } - private void assertNoChange(HashMap> beforeResult, - HashMap> afterResult, - Set geneEntrezIds, + private void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, 
+ Set entityIds, Set sampleIds) { - geneEntrezIds.forEach(entrezGeneId -> { - assertTrue("After result is expected to contain entrez_id=" + entrezGeneId, - afterResult.containsKey(entrezGeneId)); + entityIds.forEach(entityId -> { + assertTrue("After result is expected to contain entityId=" + entityId, + afterResult.containsKey(entityId)); sampleIds.forEach(sampleId -> { - assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entrez_id=" + entrezGeneId, - afterResult.get(entrezGeneId).containsKey(sampleId)); + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, + afterResult.get(entityId).containsKey(sampleId)); assertEquals("The values for sample_id=" + sampleId + - " and entrez_id=" + entrezGeneId + " before and after upload have to match.", - beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + " and entityId=" + entityId + " before and after upload have to match.", + beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); }); }); } diff --git a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt b/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt new file mode 100644 index 00000000..5edb7cfa --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The line will be skipped as the entity stable id is not in the database already +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 545c85bd..1222ce67 100644 --- 
a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -199,6 +199,11 @@ INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYP INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2261,'FGFR3','protein-coding'); +-- Generic genetic entities +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Erlotinib'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Irinotecan'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'L-685458'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Lapatinib'); -- cna_event INSERT INTO "cna_event" ("CNA_EVENT_ID","ENTREZ_GENE_ID","ALTERATION") VALUES (20093,207,-2); @@ -329,6 +334,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); +INSERT INTO "genetic_profile" 
("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','TREATMENT_RESPONSE','test treatment values','treatment values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); @@ -388,6 +394,11 @@ INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALU INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'-1.661,-1.392,-1.924,-1.656,-0.361,-1.998,-0.136,-0.709,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.233,0.561,-0.106,-0.085,-0.012,0.143,0.141,0.609,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'-0.570,-1.340,-1.544,-0.404,0.632,-1.231,0.771,-0.036,'); +-- Generic assay data +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Erlotinib'),'5.2,>8,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Irinotecan'),'>8,7.1,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'L-685458'),'>4.6,7.2,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Lapatinib'),'6.9,>~8,'); -- genetic_profile_samples INSERT INTO "genetic_profile_samples" 
("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (2,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); @@ -395,6 +406,7 @@ INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (5,'2,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (9,'2,3,6,8,9,10,12,13,'); +INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (10,'2,3,'); -- patient INSERT INTO "patient" ("INTERNAL_ID","STABLE_ID","CANCER_STUDY_ID") VALUES (1,'TCGA-A1-A0SB',1); From d11a353ff70f0a3f1370b168f6600ce90edb5d7a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 17:52:00 +0200 Subject: [PATCH 056/130] Fix integration tests --- .../portal/scripts/ImportTabDelimData.java | 22 +++++++++---------- .../dao/TestDaoGeneticProfile.java | 10 ++++----- .../TestImportCnaDiscreteLongData.java | 14 ++++++------ .../scripts/TestImportGenericAssayData.java | 5 +++-- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index d277cb95..8df2ed8b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -667,7 +667,7 @@ private boolean saveLine(String[] values, if (!microRNAGenes.isEmpty()) { // for micro rna, duplicate the data for (CanonicalGene gene : microRNAGenes) { - if (this.saveValues(gene, values)) { + if (this.saveValues(gene, values, geneSymbol)) { recordStored = true; } } @@ -687,7 +687,7 @@ private boolean saveLine(String[] values, // none of the matched genes are type "miRNA" if (genes.size() == 1) { // Store all values per 
gene: - recordStored = this.saveValues(genes.get(0), values); + recordStored = this.saveValues(genes.get(0), values, geneSymbol); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { if (updateMode) { @@ -698,7 +698,7 @@ private boolean saveLine(String[] values, } } else { if (isRppaProfile) { // for protein data, duplicate the data - recordStored = saveRppaValues(values, recordStored, genes); + recordStored = saveRppaValues(values, recordStored, genes, geneSymbol); } else { if (!recordStored) { // this case : @@ -711,16 +711,14 @@ private boolean saveLine(String[] values, return recordStored; } - private boolean saveValues(CanonicalGene canonicalGene, String[] values) throws DaoException { - //TODO Think of better way. We do that to do not remove genes that contain duplicate - if (geneticAlterationImporter.isImportedAlready(canonicalGene)) { - return false; - } + private boolean saveValues(CanonicalGene canonicalGene, String[] values, String geneSymbol) throws DaoException { if (updateMode) { values = updateValues(canonicalGene.getGeneticEntityId(), values); - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + if (!geneticAlterationImporter.isImportedAlready(canonicalGene)) { + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + } } - return geneticAlterationImporter.store(values, canonicalGene, canonicalGene.getHugoGeneSymbolAllCaps()); + return geneticAlterationImporter.store(values, canonicalGene, geneSymbol); } //TODO unify saveValues versions // With update mode the last duplicate wins. 
It's different from the other function @@ -753,9 +751,9 @@ private String[] updateValues(int geneticEntityId, String[] values) { return updatedSampleValues; } - private boolean saveRppaValues(String[] values, boolean recordStored, List genes) throws DaoException { + private boolean saveRppaValues(String[] values, boolean recordStored, List genes, String geneSymbol) throws DaoException { for (CanonicalGene gene : genes) { - if (this.saveValues(gene, values)) { + if (this.saveValues(gene, values, geneSymbol)) { recordStored = true; nrExtraRecords++; } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java index 8c1afdcc..83e04144 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java @@ -72,7 +72,7 @@ public void setUp() throws DaoException public void testDaoGetAllGeneticProfiles() throws DaoException { ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); } @Test @@ -134,12 +134,12 @@ public void testDaoDeleteGeneticProfile() throws DaoException { GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(2); - assertEquals(7, DaoGeneticProfile.getCount()); + assertEquals(9, DaoGeneticProfile.getCount()); DaoGeneticProfile.deleteGeneticProfile(geneticProfile); - assertEquals(6, DaoGeneticProfile.getCount()); + assertEquals(8, DaoGeneticProfile.getCount()); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(6, list.size()); + assertEquals(8, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("mRNA expression (microarray)", geneticProfile.getProfileName()); @@ -155,7 +155,7 @@ public void testDaoUpdateGeneticProfile() throws 
DaoException { geneticProfile.getGeneticProfileId(), "Updated Name", "Updated Description")); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("Updated Name", geneticProfile.getProfileName()); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java index d317aa03..916a16cd 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java @@ -180,7 +180,7 @@ public void testImportCnaDiscreteLongDataAddsCnaEvents() throws Exception { @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -205,7 +205,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Excepti @Test public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_cna_events_missing.txt"); new ImportCnaDiscreteLongData( @@ -233,7 +233,7 @@ public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamplesInCorrectOrder() throws Exception { 
List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -260,7 +260,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamples @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_without_entrez_with_hugo.txt"); new ImportCnaDiscreteLongData( @@ -283,7 +283,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo( @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrectHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_wrong_entrez_and_correct_hugo.txt"); new ImportCnaDiscreteLongData( @@ -306,7 +306,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrect @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -334,7 +334,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents( @Test public void testImportCnaDiscreteLongDataIgnoresLineWithDuplicateGene() throws Exception { List 
beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java index a0a33c6d..fa7e0449 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java @@ -95,10 +95,11 @@ public void testImportGenericAssayData() throws Exception { // Open mutational signature test data file File file = new File("src/test/resources/data_mutational_signature.txt"); - + int numRecordsForGenericAssayBefore = getNumRecordsForGenericAssay(); + // import data and test all mutational signatures were added ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false); - assertEquals(60, getNumRecordsForGenericAssay()); + assertEquals(numRecordsForGenericAssayBefore + 60, getNumRecordsForGenericAssay()); // test wether a record can be retrieved via stable id GenericAssayMeta genericAssayMeta1 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1"); From 7dfb1bd5f6139221a8553555ac577fa54adce817 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:19:58 +0200 Subject: [PATCH 057/130] Make tab. 
delimiter data uploader transactional --- .../org/mskcc/cbio/portal/dao/JdbcUtil.java | 21 +++- .../portal/scripts/ImportTabDelimData.java | 15 ++- ...estIncrementalTabDelimDataTransaction.java | 119 ++++++++++++++++++ 3 files changed, 149 insertions(+), 6 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java index 40f9e9ed..48f59d70 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java @@ -35,10 +35,12 @@ import java.sql.*; import java.util.*; import javax.sql.DataSource; -import org.apache.commons.dbcp2.BasicDataSource; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.mskcc.cbio.portal.util.*; +import org.springframework.jdbc.datasource.DataSourceTransactionManager; +import org.springframework.jdbc.datasource.TransactionAwareDataSourceProxy; +import org.springframework.transaction.support.TransactionTemplate; /** * Connection Utility for JDBC. 
@@ -50,6 +52,8 @@ public class JdbcUtil { private static DataSource dataSource; private static Map activeConnectionCount = new HashMap(); // keep track of the number of active connection per class/requester private static final Logger LOG = LoggerFactory.getLogger(JdbcUtil.class); + private static DataSourceTransactionManager transactionManager; + private static TransactionTemplate transactionTemplate; /** * Gets the data source @@ -57,17 +61,28 @@ public class JdbcUtil { */ public static DataSource getDataSource() { if (dataSource == null) { - dataSource = new JdbcDataSource(); + dataSource = new TransactionAwareDataSourceProxy(new JdbcDataSource()); + initSpringTx(); } return dataSource; } + private static void initSpringTx() { + transactionManager = new DataSourceTransactionManager(dataSource); + transactionTemplate = new TransactionTemplate(transactionManager); + } + /** * Sets the data source * @param value the data source */ public static void setDataSource(DataSource value) { dataSource = value; + initSpringTx(); + } + + public static TransactionTemplate getTransactionTemplate() { + return transactionTemplate; } /** diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 8df2ed8b..4a118bcc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -163,10 +163,19 @@ public ImportTabDelimData( /** * Import the Copy Number Alteration, mRNA Expression, protein RPPA, GSVA or generic_assay data * - * @throws IOException IO Error. - * @throws DaoException Database Error. 
*/ - public void importData() throws IOException, DaoException { + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); + } + private void doImportData() throws IOException, DaoException { try { this.numLines = FileUtil.getNumLines(dataFile); } catch (IOException e) { diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java new file mode 100644 index 00000000..f149d959 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java @@ -0,0 +1,119 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.cbioportal.model.CNA; +import org.jetbrains.annotations.NotNull; +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.runner.RunWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.mock; + +/** + * Tests Transaction for Incremental Import of Tab Delimited Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +public class TestIncrementalTabDelimDataTransaction { + + /** + * Test transaction + */ + @Test + @ExtendWith(MockitoExtension.class) + //Mysql does not support nested transactions. That's why we disable the outer transaction. + @Transactional(propagation = Propagation.NOT_SUPPORTED) + public void testTransaction() throws Exception { + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + + DaoGeneticAlteration mockedDao = mock(DaoGeneticAlteration.class); + + doNothing().doNothing().doThrow(new DaoException("Simulated dao error")) + .when(mockedDao).deleteAllRecordsInGeneticProfile(anyLong(), anyLong()); + /** + * Test + */ + try { + new ImportTabDelimData(dataFile, + mrnaProfile.getGeneticProfileId(), + null, + true, + mockedDao, + DaoGeneOptimized.getInstance()).importData(); + fail("Import has to fail"); + } catch (RuntimeException runtimeException) { + assertTrue(true); + } + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals(beforeResult, afterResult); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } +} From 71cdf704ae1336b16d8034c6479caac184ba2da6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:44:03 +0200 Subject: [PATCH 058/130] Check for illegal state in tab delim. 
data update It's dangerous as we would further mess up the data in the row --- .../cbio/portal/scripts/ImportTabDelimData.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 4a118bcc..17ba7104 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -421,6 +421,8 @@ private void ensureSampleGeneticProfile(Sample sample) throws DaoException { private void saveOrderedSampleList() throws DaoException { if (updateMode) { ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + checkSamplesInDataEqualTo(initialOrderSampleListSize); // add all new sample ids at the end ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); @@ -434,6 +436,17 @@ private void saveOrderedSampleList() throws DaoException { DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); } + private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the inital sample list (" + + initialOrderSampleListSize + ")."); + } + }); + } + //TODO move somewhere else private Map zip(K[] keys, V[] values) { Map map = new HashMap<>(); From 2d31dac87f676cc95eb432463ed841a14cf678f0 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:52:21 +0200 
Subject: [PATCH 059/130] Wire incremental tab delim. data upload to cli commands --- scripts/importer/cbioportalImporter.py | 2 ++ scripts/importer/cbioportal_common.py | 11 ++++++++++- .../mskcc/cbio/portal/scripts/ImportProfileData.java | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index f52bbc6a..ea9cfa50 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -464,6 +464,8 @@ def import_incremental_data(jvm_args, data_directory, update_generic_assay_entit Load all data types that are available and support incremental upload """ for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + if meta_file_type not in meta_file_type_to_meta_files: + continue meta_pairs = meta_file_type_to_meta_files[meta_file_type] for meta_pair in meta_pairs: meta_filename, meta_dictionary = meta_pair diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index eaa38a5e..798174ee 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -370,7 +370,16 @@ class MetaFileTypes(object): MetaFileTypes.PATIENT_ATTRIBUTES, MetaFileTypes.SAMPLE_ATTRIBUTES, MetaFileTypes.MUTATION, - # TODO Add more types here as incremental upload is enabled + MetaFileTypes.MUTATION_UNCALLED, + MetaFileTypes.EXPRESSION, + MetaFileTypes.CNA_DISCRETE, + MetaFileTypes.CNA_CONTINUOUS, + MetaFileTypes.CNA_LOG2, + MetaFileTypes.METHYLATION, + MetaFileTypes.PROTEIN, + MetaFileTypes.GENERIC_ASSAY_CONTINUOUS, + MetaFileTypes.GENERIC_ASSAY_BINARY, + MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index d34ab2cc..10759baa 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -127,7 +127,7 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), - false, + overwriteExisting, daoGeneticAlteration, daoGene ); genericAssayProfileImporter.importData(); @@ -152,7 +152,7 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, - false, + overwriteExisting, daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); From 49975427651a89fc7926647264714e82a0a5e2a5 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 09:59:01 +0200 Subject: [PATCH 060/130] Expand README with section on how to run incremental upload --- README.md | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 252b7ba9..51ccaf64 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,34 @@ This repo contains: ## Inclusion in main codebase The `cbioportal-core` code is currently included in the final Docker image during the Docker build process: https://github.com/cBioPortal/cbioportal/blob/master/docker/web-and-data/Dockerfile#L48 +## Running in docker + +Build docker image with: +```bash +docker build -t cbioportal-core . +``` + +Example of how to start loading of the whole study: +```bash +docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o +``` + +### Incremental upload of data + +To add or update specific patient, sample, or molecular data in an already loaded study, you can perform an incremental upload. This process is quicker than reloading the entire study. + +To execute an incremental upload, use the -d (or --data_directory) option instead of -s (or --study_directory). 
Here is an example command: +```bash +docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o +``` +**Note:** +While the directory should adhere to the standard cBioPortal file formats and study structure, please note the following specific guidelines for incremental uploads: + +- Incremental uploads are not supported for all data types. For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. +- The data pertaining to patient or sample IDs should only include entries that are either new or need updates. + +This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources. + ## How to run integration tests This section guides you through the process of running integration tests by setting up a cBioPortal MySQL database environment using Docker. Please follow these steps carefully to ensure your testing environment is configured correctly. @@ -119,15 +147,3 @@ The script will search for `core-*.jar` in the root of the project: python scripts/importer/metaImport.py -s tests/test_data/study_es_0 -p tests/test_data/api_json_unit_tests -o ``` -## Running in docker - -Build docker image with: -```bash -docker build -t cbioportal-core . -``` - -Example of how to start the loading: -```bash -docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o -``` - From 911ae2868b047b31402ac7e55c04e7af4a1aca3d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 10:39:19 +0200 Subject: [PATCH 061/130] Address TODOs in tab delim. 
importer --- .../portal/scripts/ImportTabDelimData.java | 20 +----------------- .../org/mskcc/cbio/portal/util/ArrayUtil.java | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 19 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 17ba7104..cb613b08 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -447,21 +447,6 @@ private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { }); } - //TODO move somewhere else - private Map zip(K[] keys, V[] values) { - Map map = new HashMap<>(); - - // Check if both arrays have the same length - if (keys.length == values.length) { - for (int i = 0; i < keys.length; i++) { - map.put(keys[i], values[i]); - } - } else { - throw new IllegalArgumentException("Arrays must be of the same length"); - } - return map; - } - private Map, Map> readPdAnnotations(File pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; @@ -742,8 +727,6 @@ private boolean saveValues(CanonicalGene canonicalGene, String[] values, String } return geneticAlterationImporter.store(values, canonicalGene, geneSymbol); } - //TODO unify saveValues versions - // With update mode the last duplicate wins. 
It's different from the other function private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { if (updateMode) { daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); @@ -753,8 +736,7 @@ private boolean saveValues(int geneticEntityId, String[] values) throws DaoExcep } private String[] updateValues(int geneticEntityId, String[] values) { - //TODO swap variables - Map sampleIdToValue = zip(orderedImportedSampleList.toArray(new Integer[0]), values); + Map sampleIdToValue = ArrayUtil.zip(orderedImportedSampleList.toArray(new Integer[0]), values); String[] updatedSampleValues = new String[orderedSampleList.size()]; for (int i = 0; i < orderedSampleList.size(); i++) { updatedSampleValues[i] = ""; diff --git a/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java new file mode 100644 index 00000000..3235d33e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java @@ -0,0 +1,21 @@ +package org.mskcc.cbio.portal.util; + +import java.util.HashMap; +import java.util.Map; + +public class ArrayUtil { + public static Map zip(K[] keys, V[] values) { + Map map = new HashMap<>(); + + // Check if both arrays have the same length + if (keys.length == values.length) { + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + } else { + throw new IllegalArgumentException("Arrays must be of the same length"); + } + return map; + + } +} \ No newline at end of file From c7343f9ca23c95d8c08729d4fc59b8268b9262ac Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 11:39:02 +0200 Subject: [PATCH 062/130] Add more data types to incremental data upload folder --- tests/system_tests_import_data.py | 15 +++++++++++++++ .../study_es_0_inc/data_cna_discrete.txt | 10 ++++++++++ tests/test_data/study_es_0_inc/data_cna_log2.txt | 10 ++++++++++ .../study_es_0_inc/data_expression_median.txt | 10 
++++++++++ .../study_es_0_inc/data_methylation_hm27.txt | 10 ++++++++++ .../study_es_0_inc/data_treatment_ic50.txt | 11 +++++++++++ .../study_es_0_inc/meta_cna_discrete.txt | 10 ++++++++++ tests/test_data/study_es_0_inc/meta_cna_log2.txt | 8 ++++++++ .../study_es_0_inc/meta_expression_median.txt | 8 ++++++++ .../study_es_0_inc/meta_methylation_hm27.txt | 8 ++++++++ .../study_es_0_inc/meta_treatment_ic50.txt | 12 ++++++++++++ 11 files changed, 112 insertions(+) create mode 100644 tests/test_data/study_es_0_inc/data_cna_discrete.txt create mode 100644 tests/test_data/study_es_0_inc/data_cna_log2.txt create mode 100644 tests/test_data/study_es_0_inc/data_expression_median.txt create mode 100644 tests/test_data/study_es_0_inc/data_methylation_hm27.txt create mode 100644 tests/test_data/study_es_0_inc/data_treatment_ic50.txt create mode 100644 tests/test_data/study_es_0_inc/meta_cna_discrete.txt create mode 100644 tests/test_data/study_es_0_inc/meta_cna_log2.txt create mode 100644 tests/test_data/study_es_0_inc/meta_expression_median.txt create mode 100644 tests/test_data/study_es_0_inc/meta_methylation_hm27.txt create mode 100644 tests/test_data/study_es_0_inc/meta_treatment_ic50.txt diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 097e6c01..64361571 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -107,6 +107,16 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress') mutation_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') + cna_discrete_call = call(*common_part, 
'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete.txt', '--noprogress') + cna_log2_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_log2.txt', '--noprogress') + expression_median_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_expression_median.txt', '--noprogress') + methylation_hm27_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress') + treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress') case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') @@ -115,6 +125,11 @@ def test_incremental_load(self, run_java, locate_jar): clinical_patient_call, clinical_sample_call, mutation_call, + cna_discrete_call, + cna_log2_call, + expression_median_call, + methylation_hm27_call, + treatment_ic50_call, case_list_call, ]) diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete.txt 
b/tests/test_data/study_es_0_inc/data_cna_discrete.txt new file mode 100644 index 00000000..7915f45b --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_discrete.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0 0 -1 + 375790 -1 -1 0 +ATAD3A 55210 0 0 -2 +ATAD3B 83858 -2 -1 0 +ATAD3C 219293 0 0 0 +#AURKAIP1 54998 +ERCC5 2073 0 -1 -2 +ACP3 55 0 0 0 +TP53 -1 0 -2 diff --git a/tests/test_data/study_es_0_inc/data_cna_log2.txt b/tests/test_data/study_es_0_inc/data_cna_log2.txt new file mode 100644 index 00000000..0eb820a7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_log2.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.751 0.533 0.114 + 375790 0.062 0.071 0.948 +ATAD3A 55210 0.487 0.695 0.364 +ATAD3B 83858 0.150 0.492 0.300 +ATAD3C 219293 0.995 0.170 0.654 +#AURKAIP1 54998 +ERCC5 2073 0.816 0.514 0.165 +ACP3 55 0.252 0.713 0.513 +TP53 0.360 0.538 0.891 diff --git a/tests/test_data/study_es_0_inc/data_expression_median.txt b/tests/test_data/study_es_0_inc/data_expression_median.txt new file mode 100644 index 00000000..d5c4a9a0 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_expression_median.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.096 0.826 0.032 + 375790 0.309 0.399 0.680 +ATAD3A 55210 0.569 0.189 0.266 +ATAD3B 83858 0.829 0.473 0.611 +ATAD3C 219293 0.307 0.445 0.045 +#AURKAIP1 54998 +ERCC5 2073 0.171 0.766 0.590 +ACP3 55 0.422 0.870 0.745 +TP53 0.179 0.694 0.808 diff --git a/tests/test_data/study_es_0_inc/data_methylation_hm27.txt b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt new file mode 100644 index 00000000..d2c67abc --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.022 0.681 0.790 + 375790 0.435 0.340 0.321 
+ATAD3A 55210 0.229 0.946 0.439 +ATAD3B 83858 0.885 0.707 0.664 +ATAD3C 219293 0.660 0.315 0.694 +#AURKAIP1 54998 +ERCC5 2073 0.436 0.749 0.345 +ACP3 55 0.622 0.396 0.029 +TP53 0.563 0.686 0.607 diff --git a/tests/test_data/study_es_0_inc/data_treatment_ic50.txt b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt new file mode 100644 index 00000000..2a507cef --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt @@ -0,0 +1,11 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +17-AAG Name of 17-AAG Desc of 17-AAG Url of 17-AAG 0.315 0.329701692 0.053038094 +AEW541 Name of AEW541 Desc of AEW541 Url of AEW541 >8 2.353 2.68212986 +AZD0530 Name of AZD0530 Desc of AZD0530 Url of AZD0530 0.234 >8 4.597949505 +AZD6244 Name of AZD6244 Desc of AZD6244 Url of AZD6244 >8 >8 >8 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 >8 >8 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan NA 0.083 NA +L-685458 Name of L-685458 Desc of L-685458 Url of L-685458 >8 >8 3.267752409 +#Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 NA >8 >8 +Nilotinib Name of Nilotinib Desc of Nilotinib Url of Nilotinib >8 >8 NA diff --git a/tests/test_data/study_es_0_inc/meta_cna_discrete.txt b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt new file mode 100644 index 00000000..f6ea8bea --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
+profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace diff --git a/tests/test_data/study_es_0_inc/meta_cna_log2.txt b/tests/test_data/study_es_0_inc/meta_cna_log2.txt new file mode 100644 index 00000000..74a07b8e --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_log2.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: LOG2-VALUE +stable_id: log2CNA +show_profile_in_analysis_tab: false +profile_description: Log2 copy-number values for each gene (from Affymetrix SNP6). +profile_name: Log2 copy-number values +data_filename: data_cna_log2.txt diff --git a/tests/test_data/study_es_0_inc/meta_expression_median.txt b/tests/test_data/study_es_0_inc/meta_expression_median.txt new file mode 100644 index 00000000..1e2fc6a7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_expression_median.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: MRNA_EXPRESSION +datatype: CONTINUOUS +stable_id: mrna +profile_description: Expression levels (Agilent microarray). +show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_median.txt diff --git a/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt new file mode 100644 index 00000000..582b12e9 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: METHYLATION +datatype: CONTINUOUS +stable_id: methylation_hm27 +profile_description: Methylation beta-values (HM27 platform). For genes with multiple methylation probes, the probe least correlated with expression is selected. 
+show_profile_in_analysis_tab: false +profile_name: Methylation (HM27) +data_filename: data_methylation_hm27.txt diff --git a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt new file mode 100644 index 00000000..0d3281cd --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt @@ -0,0 +1,12 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. +data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL \ No newline at end of file From 2ed0bd85cccd081c5eb7708b6929e894d6eb614a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 15 May 2024 11:42:49 +0200 Subject: [PATCH 063/130] Remove obsolete TODO comment --- .../java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index fcc2380e..0358b132 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -95,7 +95,6 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values); } - // TODO inc: update instead public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values) throws DaoException { From 
76b52a9ba3e5d1915b95b478158149f4f93b1109 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 16 May 2024 22:54:02 +0200 Subject: [PATCH 064/130] Reuse genetic_profile record if it exists in db already Do it for all data types, not only MAF --- .../portal/util/GeneticProfileReader.java | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index af686a72..ab862756 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -76,22 +76,25 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D GeneticProfile geneticProfile = loadGeneticProfileFromMeta(file); GeneticProfile existingGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(geneticProfile.getStableId()); if (existingGeneticProfile != null) { - if (!existingGeneticProfile.getDatatype().equals("MAF")) { - // the dbms already contains a GeneticProfile with the file's stable_id. This scenario is not supported - // anymore, so throw error telling user to remove existing profile first: - throw new RuntimeException("Error: genetic_profile record found with same Stable ID as the one used in your data: " - + existingGeneticProfile.getStableId() + ". Remove the existing genetic_profile record first."); - } else { - // For mutation data only we can have multiple files with the same genetic_profile. 
- // There is a constraint in the mutation database table to prevent duplicated data - // If this constraint is hit (mistakenly importing the same maf twice) MySqlBulkLoader will throw an exception - // - // make an object combining the pre-existing profile with the file-specific properties of the current file - GeneticProfile gp = new GeneticProfile(existingGeneticProfile); - gp.setTargetLine(gp.getTargetLine()); - gp.setOtherMetadataFields(gp.getAllOtherMetadataFields()); - return gp; + ProgressMonitor.setCurrentMessage("genetic_profile record found with same Stable ID (" + geneticProfile.getStableId() + + "). Using it instead."); + if (geneticProfile.getGeneticAlterationType() != existingGeneticProfile.getGeneticAlterationType()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different genetic alteration type: " + + existingGeneticProfile.getGeneticProfileId()); } + if (!existingGeneticProfile.getDatatype().equals(geneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different data type: " + + existingGeneticProfile.getDatatype()); + } + if (geneticProfile.getCancerStudyId() != existingGeneticProfile.getCancerStudyId()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different cancer study (id=" + + existingGeneticProfile.getCancerStudyId() + ")"); + } + existingGeneticProfile.setOtherMetadataFields(geneticProfile.getAllOtherMetadataFields()); + return existingGeneticProfile; } // For GSVA profiles, we want to create a geneticProfileLink from source_stable_id for: From fa160767b96376271a1cddc4a9cfb56c0c10159e Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 00:16:31 +0200 Subject: [PATCH 065/130] Test incremental upload of tab delim. 
data types from umbrella script - Split big tab. delim test to multiple tests based on data type. - Use ImportProfileData instead of ImportTabDelimData for testing. - We cover more logic with such tests. - This is more stable interface. ImportTabDelimData can be refactored. --- .../scripts/ImportGenericAssayEntity.java | 8 +- .../GeneticAlterationsTestHelper.java | 55 +++ ...IncrementalCopyNumberAlterationImport.java | 177 +++++++ .../TestIncrementalGenericAssayImporter.java | 136 ++++++ .../TestIncrementalGsvaImporter.java | 81 ++++ .../TestIncrementalMrnaExpressionImport.java | 119 +++++ .../TestIncrementalProteinLevelImport.java | 122 +++++ .../TestIncrementalTabDelimData.java | 441 ------------------ .../data_cna_discrete.txt | 0 .../data_cna_pd_annotations.txt | 0 .../meta_cna_discrete.txt | 10 + .../data_treatment_ic50.txt | 2 +- .../generic_assay/meta_treatment_ic50.txt | 12 + .../data_expression_Zscores.txt | 0 .../meta_expression_Zscores.txt | 8 + .../data_rppa.txt | 0 .../incremental/protein_level/meta_rppa.txt | 7 + src/test/resources/seed_mini.sql | 2 +- 18 files changed, 736 insertions(+), 444 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java delete mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java rename 
src/test/resources/incremental/{tab_delim_data => copy_number_alteration}/data_cna_discrete.txt (100%) rename src/test/resources/incremental/{tab_delim_data => copy_number_alteration}/data_cna_pd_annotations.txt (100%) create mode 100644 src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt rename src/test/resources/incremental/{tab_delim_data => generic_assay}/data_treatment_ic50.txt (83%) create mode 100644 src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt rename src/test/resources/incremental/{tab_delim_data => mrna_expression}/data_expression_Zscores.txt (100%) create mode 100644 src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt rename src/test/resources/incremental/{tab_delim_data => protein_level}/data_rppa.txt (100%) create mode 100644 src/test/resources/incremental/protein_level/meta_rppa.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 2da0ebd2..7da2e983 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -40,9 +40,11 @@ import java.io.File; import java.io.FileReader; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.cbioportal.model.EntityType; import org.cbioportal.model.GenericEntityProperty; @@ -50,6 +52,7 @@ import org.mskcc.cbio.portal.dao.DaoGenericAssay; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import joptsimple.OptionParser; @@ -160,7 +163,6 @@ public static void startImport(OptionSet options, OptionSpec data, Optio * @throws Exception */ public static void 
importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties, boolean updateInfo) throws Exception { - ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getCanonicalPath()); // read generic assay data file @@ -186,6 +188,10 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = buf.readLine(); while (currentLine != null) { + if (!FileUtil.isInfoLine(currentLine)) { + currentLine = buf.readLine(); + continue; + } String[] parts = currentLine.split("\t"); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java new file mode 100644 index 00000000..fdf36995 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java @@ -0,0 +1,55 @@ +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.jetbrains.annotations.NotNull; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; + +import java.util.HashMap; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class GeneticAlterationsTestHelper { + @NotNull + public static Set geneStableIdsToEntityIds(Set beforeStableIds) { + return beforeStableIds.stream().map(stableId -> { + try { + return geneStableIdToEntityId(stableId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toSet()); + } + + public static int geneStableIdToEntityId(String stableId) throws DaoException { + return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); + } + + public static void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { + assertEquals(expectedEntityIds, beforeResult.keySet()); 
+ beforeResult.forEach((entityId, sampleIdToValue) -> { + assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entityId).keySet()); + }); + } + + public static void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, + Set entityIds, + Set sampleIds) { + entityIds.forEach(entityId -> { + assertTrue("After result is expected to contain entityId=" + entityId, + afterResult.containsKey(entityId)); + sampleIds.forEach(sampleId -> { + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, + afterResult.get(entityId).containsKey(sampleId)); + assertEquals("The values for sample_id=" + sampleId + + " and entityId=" + entityId + " before and after upload have to match.", + beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); + }); + }); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java new file mode 100644 index 00000000..ad3ebd55 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java @@ -0,0 +1,177 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.cbioportal.model.CNA; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of COPY_NUMBER_ALTERATION Data.
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalCopyNumberAlterationImport { + + /** + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) + */ + @Test + public void testDiscreteCNA() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + afterSampleIds.add(newSampleId); + + GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + assertNotNull(discreteCNAProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); + Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); + List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + null, + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> beforeSampleIdToSampleCnaEvents = 
beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); + + File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); + File metaFile = new File(dataFolder, "meta_cna_discrete.txt"); + File dataFile = new File(dataFolder, "data_cna_discrete.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + + List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + afterResult.keySet(), + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); + beforeCnaEventsSampleIds.forEach(sampleId -> { + if (sampleId == updateSampleId) { + return; + } + Set 
beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); + }); + Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 208l, CNA.HOMDEL, + 3265l, CNA.AMP, + 4893l, CNA.HOMDEL, + 672l, CNA.AMP, + 673l, CNA.AMP, + 675l, CNA.HOMDEL, + newGeneEntrezId, CNA.HOMDEL + ), + newSampleEntrezGeneIdToCnaAlteration); + Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 10000l, CNA.HOMDEL, + 207l, CNA.AMP, + 3845l, CNA.AMP, + 673l, CNA.HOMDEL, + newGeneEntrezId, CNA.AMP + ), + updatedSampleEntrezGeneIdToCnaAlteration); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java new file mode 100644 index 00000000..3162f6a3 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -0,0 +1,136 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import of Generic Assay data + * + * @author Ruslan Forostianov + * @author 
Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGenericAssayImporter { + + /** + * Test incremental upload of GENERIC_ASSAY + */ + @Test + public void testGenericAssay() throws DaoException, IOException { + /** + * Prior checks + */ + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + beforeStableIds.add(absentStableId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + 
assertEquals("After result should have +1 amount of entries", beforeResult.size() + 1, afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), Set.of(noChangeSampleId)); + int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); + assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); + assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); + int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); + assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); + assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); + int absentEntityId = geneStableIdToEntityId(absentStableId); + assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); + assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); + int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); + assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); + assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); + int lbw242EntityId = geneStableIdToEntityId("LBW242"); + assertEquals("0.1", afterResult.get(lbw242EntityId).get(newSampleId)); + assertEquals(">~8", afterResult.get(lbw242EntityId).get(updateSampleId)); + assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java new file mode 100644 index 00000000..c629ecb4 --- /dev/null +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java @@ -0,0 +1,81 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static 
org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import is not supported for GSVA data type + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGsvaImporter { + @Test + public void testGsvaIsNotSupported() throws DaoException, IOException { + GeneticProfile gsvaProfile = new GeneticProfile(); + gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); + gsvaProfile.setStableId("gsva_scores"); + gsvaProfile.setDatatype("GENESET_SCORE"); + gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); + gsvaProfile.setProfileName("gsva test platform"); + DaoGeneticProfile.addGeneticProfile(gsvaProfile); + + assertThrows(UnsupportedOperationException.class, () -> + new ImportTabDelimData(File.createTempFile("gsva", "test"), + DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java new file mode 100644 index 00000000..d44ccee5 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java @@ -0,0 
+1,119 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of MRNA_EXPRESSION Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalMrnaExpressionImport { + + /** + * Test incremental upload of MRNA_EXPRESSION + */ + @Test + public void testMrnaExpression() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + assertNotNull(mrnaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/mrna_expression/"); + File metaFile = new File(dataFolder, "meta_expression_Zscores.txt"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample", beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + HashMap newGeneRow = afterResult.get(newGeneEntrezId); + assertEquals("-0.1735", newGeneRow.get(newSampleId)); + assertEquals("-0.6412", newGeneRow.get(updateSampleId)); + HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); + assertEquals("", absentGeneRow.get(newSampleId)); + assertEquals("", absentGeneRow.get(updateSampleId)); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java new file mode 100644 index 00000000..f3933b27 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java @@ -0,0 +1,122 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of PROTEIN_LEVEL Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalProteinLevelImport { + + /** + * Test incremental upload of PROTEIN_LEVEL + */ + @Test + public void testRppa() throws DaoException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); + assertNotNull(rppaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/protein_level/"); + File metaFile = new File(dataFolder, "meta_rppa.txt"); + File dataFile = new File(dataFolder, "data_rppa.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java deleted file mode 100644 index 4f4b2aef..00000000 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ /dev/null @@ -1,441 +0,0 @@ -/* - * This file is part of cBioPortal. - * - * cBioPortal is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
-*/ - -package org.mskcc.cbio.portal.integrationTest.incremental; - -import org.cbioportal.model.CNA; -import org.jetbrains.annotations.NotNull; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mskcc.cbio.portal.dao.DaoCancerStudy; -import org.mskcc.cbio.portal.dao.DaoCnaEvent; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; -import org.mskcc.cbio.portal.dao.DaoGeneticEntity; -import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.model.CnaEvent; -import org.mskcc.cbio.portal.model.GeneticAlterationType; -import org.mskcc.cbio.portal.model.GeneticProfile; -import org.mskcc.cbio.portal.scripts.ImportTabDelimData; -import org.springframework.test.annotation.Rollback; -import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; -import org.springframework.transaction.annotation.Transactional; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertThrows; -import static org.junit.Assert.assertTrue; - -/** - * Tests Incremental Import of Tab Delimited Data. 
- * - * @author Ruslan Forostianov - * @author Pieter Lukasse - */ -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) -@Rollback -@Transactional -public class TestIncrementalTabDelimData { - - @Before - public void setUp() throws DaoException { - DaoCancerStudy.reCacheAll(); - } - - /** - * Test incremental upload of MRNA_EXPRESSION - */ - @Test - public void testMrnaExpression() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ARAF - final long absentGeneEntrezId = 369l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); - assertNotNull(mrnaProfile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); - - /** - * Test - */ - new ImportTabDelimData(dataFile, - mrnaProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - HashMap newGeneRow = afterResult.get(newGeneEntrezId); - assertEquals("-0.1735", newGeneRow.get(newSampleId)); - assertEquals("-0.6412", newGeneRow.get(updateSampleId)); - HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); - assertEquals("", absentGeneRow.get(newSampleId)); - assertEquals("", absentGeneRow.get(updateSampleId)); - } - - /** - * Test incremental upload of PROTEIN_LEVEL - */ - @Test - public void testRppa() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ARAF - final long absentGeneEntrezId = 369l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); - assertNotNull(rppaProfile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, 
beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_rppa.txt"); - - /** - * Test - */ - new ImportTabDelimData(dataFile, - rppaProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); - assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); - } - - /** - * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) - */ - @Test - public void testDiscreteCNA() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ATM - final long absentGeneEntrezId = 472l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-XX-0800 - final int newSampleId = 15; - // stable_id: TCGA-A1-A0SO - final int updateSampleId = 12; - final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 13, 14); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - final Set afterSampleIds = new HashSet<>(beforeSampleIds); - afterSampleIds.add(newSampleId); - - GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); - assertNotNull(discreteCNAProfile); - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); - - List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); - Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); - List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), - null, - discreteCNAProfile.getGeneticProfileId(), - allCnaLevels); - Map> beforeSampleIdToSampleCnaEvents = beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); - assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_cna_discrete.txt"); - File pdAnnotations = new File(dataFolder, "data_cna_pd_annotations.txt"); - - /** - * Test - */ - ImportTabDelimData importer = new ImportTabDelimData(dataFile, - discreteCNAProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()); - importer.setPdAnnotationsFile(pdAnnotations); - importer.importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra 
sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); - assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); - - List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), - afterResult.keySet(), - discreteCNAProfile.getGeneticProfileId(), - allCnaLevels); - Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); - assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); - beforeCnaEventsSampleIds.forEach(sampleId -> { - if (sampleId == updateSampleId) { - return; - } - Set beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); - Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); - assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); - }); - Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() - .map(CnaEvent::getEvent) - .collect(Collectors.toMap( - event -> event.getGene().getEntrezGeneId(), - CnaEvent.Event::getAlteration)); - assertEquals(Map.of( - 208l, CNA.HOMDEL, - 3265l, CNA.AMP, - 4893l, CNA.HOMDEL, - 672l, CNA.AMP, - 673l, CNA.AMP, - 675l, CNA.HOMDEL, - newGeneEntrezId, CNA.HOMDEL - ), - newSampleEntrezGeneIdToCnaAlteration); - Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() - .map(CnaEvent::getEvent) - .collect(Collectors.toMap( - event -> 
event.getGene().getEntrezGeneId(), - CnaEvent.Event::getAlteration)); - assertEquals(Map.of( - 10000l, CNA.HOMDEL, - 207l, CNA.AMP, - 3845l, CNA.AMP, - 673l, CNA.HOMDEL, - newGeneEntrezId, CNA.AMP - ), - updatedSampleEntrezGeneIdToCnaAlteration); - } - - @Test - public void testGsvaIsNotSupported() throws DaoException, IOException { - GeneticProfile gsvaProfile = new GeneticProfile(); - gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); - gsvaProfile.setStableId("gsva_scores"); - gsvaProfile.setDatatype("GENESET_SCORE"); - gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); - gsvaProfile.setProfileName("gsva test platform"); - DaoGeneticProfile.addGeneticProfile(gsvaProfile); - - assertThrows(UnsupportedOperationException.class, () -> - new ImportTabDelimData(File.createTempFile("gsva", "test"), - DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance())); - } - - /** - * Test incremental upload of GENERIC_ASSAY - */ - @Test - public void testGenericAssay() throws DaoException, IOException { - /** - * Prior checks - */ - // Stable id that is part of the platform, but absent during the incremental upload - final String absentStableId = "L-685458"; - final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); - final Set beforeStableIds = new HashSet<>(noChangeStableIds); - beforeStableIds.add(absentStableId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - // stable_id: TCGA-A1-A0SE-01 - final int noChangeSampleId = 3; - final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); - - GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); - assertNotNull(ic50Profile); - - HashMap> beforeResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); - assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); - - /** - * Test - */ - new ImportTabDelimData( - dataFile, - null, - ic50Profile.getGeneticProfileId(), - null, - "NAME,DESCRIPTION,URL", - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - assertEquals("After result should have the same amount of entries", beforeResult.size(), afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), Set.of(noChangeSampleId)); - int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); - assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); - assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); - int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); - assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); - assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); - int absentEntityId = geneStableIdToEntityId(absentStableId); - assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); - assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); - int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); - assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); - 
assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); - assertNull("No new generic entity has been added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); - } - - @NotNull - private Set geneStableIdsToEntityIds(Set beforeStableIds) { - return beforeStableIds.stream().map(stableId -> { - try { - return geneStableIdToEntityId(stableId); - } catch (DaoException e) { - throw new RuntimeException(e); - } - }).collect(Collectors.toSet()); - } - - private int geneStableIdToEntityId(String stableId) throws DaoException { - return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); - } - - private void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { - assertEquals(expectedEntityIds, beforeResult.keySet()); - beforeResult.forEach((entityId, sampleIdToValue) -> { - assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", - expectedSampleIds, beforeResult.get(entityId).keySet()); - }); - } - - private void assertNoChange(HashMap> beforeResult, - HashMap> afterResult, - Set entityIds, - Set sampleIds) { - entityIds.forEach(entityId -> { - assertTrue("After result is expected to contain entityId=" + entityId, - afterResult.containsKey(entityId)); - sampleIds.forEach(sampleId -> { - assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, - afterResult.get(entityId).containsKey(sampleId)); - assertEquals("The values for sample_id=" + sampleId + - " and entityId=" + entityId + " before and after upload have to match.", - beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); - }); - }); - } - -} diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt rename to 
src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt rename to src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt new file mode 100644 index 00000000..827c31dd --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
+profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace diff --git a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt similarity index 83% rename from src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt rename to src/test/resources/incremental/generic_assay/data_treatment_ic50.txt index 5edb7cfa..79606fbf 100644 --- a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt @@ -4,5 +4,5 @@ Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 # The database has this entity, but not the file #L-685458 Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 -#The line will be skipped as the entity stable id is not in the database already +#The entity will be added LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt new file mode 100644 index 00000000..6ec6cdc5 --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt @@ -0,0 +1,12 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. 
+data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt rename to src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt diff --git a/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt new file mode 100644 index 00000000..e761fed3 --- /dev/null +++ b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MRNA_EXPRESSION +datatype: Z-SCORE +stable_id: mrna +profile_description: Expression levels (Agilent microarray). 
+show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_Zscores.txt diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/protein_level/data_rppa.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_rppa.txt rename to src/test/resources/incremental/protein_level/data_rppa.txt diff --git a/src/test/resources/incremental/protein_level/meta_rppa.txt b/src/test/resources/incremental/protein_level/meta_rppa.txt new file mode 100644 index 00000000..f6481c7d --- /dev/null +++ b/src/test/resources/incremental/protein_level/meta_rppa.txt @@ -0,0 +1,7 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: PROTEIN_LEVEL +datatype: LOG2-VALUE +stable_id: rppa +profile_name: Test RPPA +profile_description: Test protein level data +data_filename: data_rppa.txt diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 1222ce67..552db83e 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -334,7 +334,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES 
(9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); -INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','TREATMENT_RESPONSE','test treatment values','treatment values dummy data','0'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','LIMIT-VALUE','test treatment values','treatment values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); From e5ccc3e90959514a1b3426e4789d2bd34834f2a7 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 00:27:44 +0200 Subject: [PATCH 066/130] Move counting lines if file inside generic assay patient level data uploader --- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 3 ++- .../java/org/mskcc/cbio/portal/scripts/ImportProfileData.java | 3 +-- .../scripts/TestImportGenericAssayPatientLevelData.java | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index a7dda2ca..dddcc156 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -79,7 +79,8 @@ public ImportGenericAssayPatientLevelData(File dataFile, String targetLine, int * @throws IOException IO Error. * @throws DaoException Database Error. 
*/ - public void importData(int numLines) throws IOException, DaoException { + public void importData() throws IOException, DaoException { + int numLines = FileUtil.getNumLines(dataFile); geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 10759baa..d5b6241a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -83,7 +83,6 @@ public void run() { } // Print profile report - int numLines = FileUtil.getNumLines(dataFile); ProgressMonitor.setCurrentMessage( " --> profile id: " + geneticProfile.getGeneticProfileId() + "\n --> profile name: " + geneticProfile.getProfileName() + @@ -118,7 +117,7 @@ public void run() { String patientLevel = geneticProfile.getOtherMetaDataField("patient_level"); if (patientLevel != null && patientLevel.trim().toLowerCase().equals("true")) { ImportGenericAssayPatientLevelData genericAssayProfileImporter = new ImportGenericAssayPatientLevelData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } else { // use ImportTabDelimData importer for non-patient level data ImportTabDelimData genericAssayProfileImporter = new ImportTabDelimData( diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java index 123715f8..480e9a61 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java @@ -53,7 +53,6 @@ import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.scripts.ImportGenericAssayPatientLevelData; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -151,8 +150,7 @@ private void runImportGenericAssayPatientLevelData() throws DaoException, IOExce File file = new File("src/test/resources/tabDelimitedData/data_patient_generic_assay.txt"); ImportGenericAssayPatientLevelData parser = new ImportGenericAssayPatientLevelData(file, null, geneticProfileId, null, "name,description"); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + parser.importData(); HashMap> geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, Arrays.asList(geneticEntity1.getId(), geneticEntity2.getId())); From 472f47ec88d9056db9d3bf81410f64dcd8c4b303 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 10:13:14 +0200 Subject: [PATCH 067/130] Give error that generic asssay patient level data is not supported --- .../portal/scripts/ImportProfileData.java | 3 + .../TestIncrementalGenericAssayImporter.java | 74 +++++++++++++------ .../data_treatment_ic50_patient_level.txt | 8 ++ .../meta_treatment_ic50_patient_level.txt | 13 ++++ 4 files changed, 74 insertions(+), 24 deletions(-) create mode 100644 src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt create mode 100644 src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index d5b6241a..0e1ff058 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -116,6 +116,9 @@ public void run() { // use a different importer for patient level data String patientLevel = geneticProfile.getOtherMetaDataField("patient_level"); if (patientLevel != null && patientLevel.trim().toLowerCase().equals("true")) { + if (overwriteExisting) { + throw new UnsupportedOperationException("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead."); + } ImportGenericAssayPatientLevelData genericAssayProfileImporter = new ImportGenericAssayPatientLevelData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); genericAssayProfileImporter.importData(); } else { diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java index 3162f6a3..e0ef8cf5 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -41,6 +41,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; @@ -58,34 +59,28 @@ @Transactional public class TestIncrementalGenericAssayImporter { + 
// stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + { beforeStableIds.add(absentStableId); } + + private GeneticProfile ic50Profile; + private HashMap> beforeResult; + /** * Test incremental upload of GENERIC_ASSAY */ @Test - public void testGenericAssay() throws DaoException, IOException { - /** - * Prior checks - */ - // Stable id that is part of the platform, but absent during the incremental upload - final String absentStableId = "L-685458"; - final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); - final Set beforeStableIds = new HashSet<>(noChangeStableIds); - beforeStableIds.add(absentStableId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - // stable_id: TCGA-A1-A0SE-01 - final int noChangeSampleId = 3; - final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); - - GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); - assertNotNull(ic50Profile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); - assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + public void testGenericAssay() throws DaoException { File dataFolder = new File("src/test/resources/incremental/generic_assay/"); File metaFile = new File(dataFolder, 
"meta_treatment_ic50.txt"); @@ -128,9 +123,40 @@ public void testGenericAssay() throws DaoException, IOException { assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); } + /** + * Test that incremental upload of GENERIC_ASSAY (patient level) is not supported + */ + @Test + public void testGenericAssayPatientLevel() throws DaoException { + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50_patient_level.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50_patient_level.txt"); + + /** + * Test + */ + assertThrows("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead.", + RuntimeException.class, () -> { + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + }); + } + @Before public void setUp() throws DaoException { DaoCancerStudy.reCacheAll(); + + ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); } } diff --git a/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..34753bba --- /dev/null +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB TCGA-A1-A0SD +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of 
Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The entity will be added +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..181899f5 --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt @@ -0,0 +1,13 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. 
+data_filename: data_treatment_ic50_patient_level.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL +patient_level: true From c54e303ea4793cc384461288391cd318583f22bb Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 14:05:24 +0200 Subject: [PATCH 068/130] Clean sample_cna_event despite whether it has alteration_driver_annotation rows or not --- src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index 0e4ab7e8..f19bf514 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -129,7 +129,7 @@ public static void removeSampleCnaEvents(int cnaProfileId, List sampleI pstmt = con.prepareStatement ("DELETE sample_cna_event, alteration_driver_annotation" + " FROM sample_cna_event" + - " JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + + " LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? 
AND sample_cna_event.`SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ")"); From 18dbdd33dc98390f7285649413eabe751d5b10a6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 13:33:11 +0200 Subject: [PATCH 069/130] Fix cbioportalImport script execution args variable was not declared --- scripts/importer/cbioportalImporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index ea9cfa50..c2f65cc0 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -653,5 +653,5 @@ def main(args): # ready to roll if __name__ == '__main__': - parsed_args = interface(args) + parsed_args = interface() main(parsed_args) From c702a8bc80f2b123911e7cb28c1bc3a9948e1c93 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 16:18:14 +0200 Subject: [PATCH 070/130] Remove not needed spring context initialisation that caused different errors to occur --- .../mskcc/cbio/portal/dao/DaoCancerStudy.java | 23 ++++++++--- .../cbio/portal/dao/DaoGeneticProfile.java | 16 +++++--- .../cbio/portal/dao/DaoReferenceGenome.java | 13 +++--- .../cbio/portal/scripts/AddCaseList.java | 9 ++-- .../portal/scripts/ImportCancerStudy.java | 8 ++-- .../portal/scripts/ImportClinicalData.java | 41 +++++++++++++++---- .../scripts/ImportCopyNumberSegmentData.java | 32 +++++++++++---- .../cbio/portal/scripts/ImportCosmicData.java | 23 +++++++---- .../cbio/portal/scripts/ImportDrugs.java | 13 ++++-- .../cbio/portal/scripts/ImportGeneData.java | 37 ++++++++++++----- .../scripts/ImportGenePanelProfileMap.java | 32 +++++++++++---- .../cbio/portal/scripts/ImportGisticData.java | 18 ++++---- .../mskcc/cbio/portal/scripts/ImportHprd.java | 14 +++++-- .../portal/scripts/ImportMicroRNAIDs.java | 19 ++++++--- .../cbio/portal/scripts/ImportMutSigData.java | 11 ++--- .../scripts/ImportPathwayCommonsExtSif.java | 14 
+++++-- .../portal/scripts/ImportProfileData.java | 18 ++++---- .../portal/scripts/ImportReferenceGenome.java | 27 +++++++----- .../portal/scripts/ImportResourceData.java | 40 +++++++++++++----- .../scripts/ImportResourceDefinition.java | 30 ++++++++++---- .../cbio/portal/scripts/ImportSampleList.java | 24 ++++++++--- .../mskcc/cbio/portal/scripts/ImportSif.java | 18 +++++--- .../portal/scripts/ImportTimelineData.java | 23 +++++++---- .../portal/scripts/ImportTypesOfCancers.java | 3 -- .../cbio/portal/scripts/ImportUsers.java | 19 +++++---- .../scripts/NormalizeExpressionLevels.java | 18 ++++++-- .../portal/scripts/RemoveCancerStudy.java | 5 +-- .../cbio/portal/scripts/ResetDatabase.java | 23 +++++++++-- .../scripts/TransactionalScriptRunner.java | 8 ++-- .../portal/scripts/UpdateCancerStudy.java | 5 +-- .../cbio/portal/scripts/UpdateMetaData.java | 7 ++-- 31 files changed, 405 insertions(+), 186 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java index 4073dbbb..64e9ca59 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCancerStudy.java @@ -32,12 +32,24 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import java.text.*; -import java.util.*; import org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CancerStudyTags; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.TypeOfCancer; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; 
+import java.util.Set; /** * Analogous to and replaces the old DaoCancerType. A CancerStudy has a NAME and @@ -61,7 +73,6 @@ public static enum Status { private static final Map byInternalId = new HashMap(); static { - SpringUtil.initDataSource(); reCacheAll(); } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java index 0326fb2f..baf5f530 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticProfile.java @@ -32,10 +32,17 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import java.util.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.SpringUtil; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; /** * Analogous to and replaces the old DaoCancerType. 
A CancerStudy has a NAME and @@ -52,7 +59,6 @@ private DaoGeneticProfile() {} private static final Map> byStudy = new HashMap>(); static { - SpringUtil.initDataSource(); reCache(); } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java index 1d9bb499..9f33d7e4 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoReferenceGenome.java @@ -17,11 +17,15 @@ package org.mskcc.cbio.portal.dao; -import java.sql.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.SpringUtil; +import org.mskcc.cbio.portal.model.ReferenceGenome; -import java.util.*; +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.HashMap; +import java.util.Map; /** @@ -36,7 +40,6 @@ public final class DaoReferenceGenome { private static final Map genomeInternalIds = new HashMap(); static { - SpringUtil.initDataSource(); reCache(); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java b/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java index acc067c6..e91ebef4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/AddCaseList.java @@ -23,9 +23,6 @@ package org.mskcc.cbio.portal.scripts; -import java.util.ArrayList; -import java.util.List; - import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleList; @@ -33,9 +30,10 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.model.SampleList; import org.mskcc.cbio.portal.model.SampleListCategory; -import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; + +import 
java.util.ArrayList; +import java.util.List; /** * Command Line tool to Add new case lists by generating them based on some rules. @@ -131,7 +129,6 @@ public void run() { throw new UsageException(progName, null, argSpec, "cancer_study_identifier is not specified."); } - SpringUtil.initDataSource(); CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java index 03372dbb..75ab1f91 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCancerStudy.java @@ -32,8 +32,11 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CancerStudyTags; +import org.mskcc.cbio.portal.util.CancerStudyReader; +import org.mskcc.cbio.portal.util.CancerStudyTagsReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; @@ -53,7 +56,6 @@ public void run() { } File file = new File(args[0]); - SpringUtil.initDataSource(); CancerStudy cancerStudy = CancerStudyReader.loadCancerStudy(file); CancerStudyTags cancerStudyTags = CancerStudyTagsReader.loadCancerStudyTags(file, cancerStudy); String message = "Loaded the following cancer study:" + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java index 0e37c9e0..d15beed6 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportClinicalData.java @@ -32,16 +32,40 @@ package org.mskcc.cbio.portal.scripts; -import 
org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.apache.commons.collections4.map.MultiKeyMap; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalAttributeMeta; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalAttribute; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.SurvivalAttributeUtil; import org.mskcc.cbio.portal.util.SurvivalAttributeUtil.SurvivalStatusAttributes; -import java.io.*; -import joptsimple.*; -import java.util.*; -import java.util.regex.*; -import org.apache.commons.collections4.map.MultiKeyMap; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.Properties; +import java.util.Set; +import java.util.regex.Matcher; public class ImportClinicalData extends ConsoleRunnable { @@ -677,7 +701,6 @@ public void run() { overwriteExisting = true; } - SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 1c876a75..92343aa3 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -32,15 +32,30 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; - -import joptsimple.*; - -import java.io.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegment; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.CopyNumberSegment; +import org.mskcc.cbio.portal.model.CopyNumberSegmentFile; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; import java.math.BigDecimal; -import java.util.*; +import java.util.Properties; /** * Import Segment data into database. 
@@ -118,7 +133,6 @@ public void run() { ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile); - SpringUtil.initDataSource(); CancerStudy cancerStudy = getCancerStudy(properties); if (segmentDataExistsForCancerStudy(cancerStudy)) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java index cea48b84..3cb549c8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java @@ -32,16 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.dao.DaoCosmicData; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CosmicMutationFrequency; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MutationKeywordUtils; +import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.util.Assert; -import java.io.*; - +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.util.HashMap; import java.util.Map; -import java.util.regex.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import java.util.stream.IntStream; public class ImportCosmicData { @@ -180,7 +190,6 @@ public static void main(String[] args) throws Exception { System.out.println("command line usage: importCosmicData.pl "); return; } - SpringUtil.initDataSource(); DaoCosmicData.deleteAllRecords(); ProgressMonitor.setConsoleMode(true); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java index a819b0de..17e03c2e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java @@ -32,11 +32,17 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; /** * Command Line tool to import background drug information. @@ -79,7 +85,6 @@ public static void main(String[] args) throws Exception { return; } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); File file = new File(args[0]); System.out.println("Reading drug data from: " + file.getAbsolutePath()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index cc3300c0..80a062c6 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -32,19 +32,36 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.model.ReferenceGenome; -import org.mskcc.cbio.portal.model.ReferenceGenomeGene; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionException; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; - -import java.io.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import 
org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import org.mskcc.cbio.portal.dao.DaoReferenceGenomeGene; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.model.ReferenceGenomeGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.EntrezValidator; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GlobalProperties; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.HashSet; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; +import java.util.Set; /** * Command Line Tool to Import Background Gene Data. @@ -684,8 +701,6 @@ static void importSuppGeneData(File suppGeneFile, String referenceGenomeBuild) t @Override public void run() { try { - SpringUtil.initDataSource(); - String description = "Update gene / gene alias tables "; // using a real options parser, helps avoid bugs diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index 8e0d77c4..483fa7c2 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -32,13 +32,30 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; 
+import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Properties; /** * @@ -96,7 +113,6 @@ public void run() { } setFile(genePanel_f); - SpringUtil.initDataSource(); importData(); } catch (RuntimeException e) { throw e; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java index e35a9ed6..f575202e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGisticData.java @@ -32,15 +32,19 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.ArrayList; - +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGistic; import org.mskcc.cbio.portal.model.Gistic; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GisticReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.validate.validationException; -import joptsimple.OptionSet; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; /** @@ -57,8 +61,6 @@ public void run () { String dataFile = (String) 
options.valueOf("data"); String studyId = (String) options.valueOf("study"); - SpringUtil.initDataSource(); - File gistic_f = new File(dataFile); int cancerStudyInternalId = ValidationUtils.getInternalStudyId(studyId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java index 1cbe26aa..1102d618 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportHprd.java @@ -32,11 +32,18 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; /** * Command Line to Import HPRD Interactions. 
@@ -122,7 +129,6 @@ public static void main(String[] args) { return; } ProgressMonitor.setConsoleModeAndParseShowProgress(args); - SpringUtil.initDataSource(); try { File geneFile = new File(args[0]); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java index 929565fe..66b639c1 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMicroRNAIDs.java @@ -32,11 +32,21 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; /** * Command Line Tool to Import Background Gene Data. 
@@ -48,7 +58,6 @@ public static void importData(File geneFile) throws IOException, DaoException { FileReader reader = new FileReader(geneFile); BufferedReader buf = new BufferedReader(reader); String line = buf.readLine(); // skip first line - SpringUtil.initDataSource(); DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); List mirnas = new ArrayList(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java index f03f751f..7c0f80ac 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportMutSigData.java @@ -32,10 +32,12 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MutSigReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; import java.io.IOException; @@ -54,8 +56,7 @@ public void run() { OptionSet options = ConsoleUtil.parseStandardDataAndStudyOptions(args, description); String dataFile = (String) options.valueOf("data"); String studyId = (String) options.valueOf("study"); - SpringUtil.initDataSource(); - + File mutSigFile = new File(dataFile); ProgressMonitor.setCurrentMessage( "Reading data from: " + mutSigFile.getAbsolutePath()); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java index a17f783b..73a864c8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportPathwayCommonsExtSif.java @@ -32,11 +32,18 @@ package org.mskcc.cbio.portal.scripts; -import 
org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; /** * Command Line to Import HPRD Interactions. @@ -119,7 +126,6 @@ public static void main(String[] args) { return; } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); try { File sifFile = new File(args[0]); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 0e1ff058..193b7cf5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -32,14 +32,17 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.Set; - -import joptsimple.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.GeneticProfileReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; +import java.io.File; +import java.util.Set; import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; @@ -70,7 +73,6 @@ public void run() { updateInfo = true; } boolean overwriteExisting = options.has("overwrite-existing"); - 
SpringUtil.initDataSource(); ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getAbsolutePath()); // Load genetic profile and gene panel GeneticProfile geneticProfile = null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java index 1230608a..0e3b032e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportReferenceGenome.java @@ -17,20 +17,27 @@ package org.mskcc.cbio.portal.scripts; -import org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.ReferenceGenome; -import org.mskcc.cbio.portal.util.*; - import joptsimple.OptionException; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; - -import java.io.*; -import java.util.*; -import java.text.SimpleDateFormat; +import org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.ReferenceGenome; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; import java.text.ParseException; +import java.text.SimpleDateFormat; +import java.util.HashSet; +import java.util.Set; /** * Command Line Tool to Import Reference Genome Used by Molecular Profiling. 
@@ -108,8 +115,6 @@ private static void addReferenceGenomesToDB(Set referenceGenome @Override public void run() { try { - SpringUtil.initDataSource(); - String description = "Update reference_genome table "; // using a real options parser, helps avoid bugs diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java index 147d59d9..ea1e3730 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java @@ -1,16 +1,37 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; -import java.util.stream.Collectors; - +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; import org.apache.commons.collections4.map.MultiKeyMap; import org.cbioportal.model.ResourceType; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoResourceData; +import org.mskcc.cbio.portal.dao.DaoResourceDefinition; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.ResourceDefinition; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MissingValues; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; 
+import java.util.Properties; +import java.util.Set; +import java.util.stream.Collectors; public class ImportResourceData extends ConsoleRunnable { @@ -437,7 +458,6 @@ public void run() { relaxed = true; } - SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java index 5811ee44..a4b7b7ec 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceDefinition.java @@ -1,13 +1,28 @@ package org.mskcc.cbio.portal.scripts; +import joptsimple.OptionException; +import joptsimple.OptionParser; +import joptsimple.OptionSet; +import joptsimple.OptionSpec; import org.cbioportal.model.ResourceType; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; -import joptsimple.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoResourceDefinition; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ResourceDefinition; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.MissingValues; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; public class ImportResourceDefinition extends ConsoleRunnable { @@ -290,7 +305,6 @@ public void run() { relaxed = 
true; } - SpringUtil.initDataSource(); CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyStableId); if (cancerStudy == null) { throw new IllegalArgumentException("Unknown cancer study: " + cancerStudyStableId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java index 356471e7..ea85fe03 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSampleList.java @@ -32,13 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.model.SampleList; +import org.mskcc.cbio.portal.model.SampleListCategory; +import org.mskcc.cbio.portal.util.CaseList; +import org.mskcc.cbio.portal.util.CaseListReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; import org.mskcc.cbio.portal.validate.CaseListValidator; -import java.io.*; -import java.util.*; +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; /** * Command Line tool to Import Sample Lists. 
@@ -50,7 +63,6 @@ public static void importSampleList(File dataFile) throws IOException, DaoExcept CaseList caseList = CaseListReader.readFile(dataFile); CaseListValidator.validateAll(caseList); - SpringUtil.initDataSource(); CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(caseList.getCancerStudyIdentifier()); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java index 75589416..6763e41c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportSif.java @@ -32,12 +32,20 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.io.*; -import java.util.*; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; /** * Command Line to Import SIF Interactions. 
@@ -151,8 +159,6 @@ public static void main(String[] args) { } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); - try { File geneFile = new File(args[0]); String dataSource = args[1]; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java index 0b5b182b..4b12a431 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java @@ -32,14 +32,24 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; -import joptsimple.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; +import joptsimple.OptionSet; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.ClinicalEvent; +import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileReader; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; +import java.util.Properties; /** * Imports timeline data for display in patient view @@ -50,7 +60,6 @@ public class ImportTimelineData extends ConsoleRunnable { private static void importData(String dataFile, int cancerStudyId) throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); - SpringUtil.initDataSource(); ProgressMonitor.setCurrentMessage("Reading file " + dataFile); FileReader reader = new FileReader(dataFile); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java index 
9e119f03..4fb17193 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java @@ -37,10 +37,8 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.model.TypeOfCancer; -import org.mskcc.cbio.portal.scripts.ConsoleRunnable; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; -import org.mskcc.cbio.portal.util.SpringUtil; /** * Load all the types of cancer and their names from a file. @@ -76,7 +74,6 @@ public void run() { public static void load(File file, boolean clobber) throws IOException, DaoException { ProgressMonitor.setCurrentMessage("Loading cancer types..."); List typeOfCancerList = parseCancerTypesFromFile(file); - SpringUtil.initDataSource(); if (clobber) { ProgressMonitor.setCurrentMessage("Deleting all previous cancer types..."); DaoTypeOfCancer.deleteAllRecords(); //TODO - remove this option - foreign key constraints may mean large cascade effects (possibly the deletion of all studies) - instead, change the option to 'deleteTypeOfCancerIfNotPresent' and add a loop through existing typeOfCancer records, removing those which are not in the parsed typeOfCancerList diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java index c886c8b8..7a3ed177 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java @@ -33,12 +33,19 @@ package org.mskcc.cbio.portal.scripts; // imports -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; -import java.io.*; -import java.util.*; +import org.mskcc.cbio.portal.dao.DaoUser; +import org.mskcc.cbio.portal.dao.DaoUserAuthorities; +import org.mskcc.cbio.portal.model.User; +import 
org.mskcc.cbio.portal.model.UserAuthorities; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.Arrays; +import java.util.List; /** * Import a file of users and their authorities. @@ -62,8 +69,6 @@ public static void main(String[] args) throws Exception { ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); - File file = new File(args[0]); FileReader reader = new FileReader(file); BufferedReader buf = new BufferedReader(reader); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java b/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java index a5c8d642..d2214c93 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/NormalizeExpressionLevels.java @@ -32,14 +32,25 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.SpringUtil; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import java.io.BufferedReader; +import java.io.FileNotFoundException; +import java.io.FileReader; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Locale; + /** * * Given expression and CNV data for a set of samples generate normalized expression values. 
@@ -110,7 +121,6 @@ public class NormalizeExpressionLevels{ public static void main (String[]args) { try { - SpringUtil.initDataSource(); // init dao gene daoGeneOptimized = DaoGeneOptimized.getInstance(); driver(args); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java index ad515683..8ededd3a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/RemoveCancerStudy.java @@ -32,9 +32,9 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.util.ProgressMonitor; /** * Command Line Tool to Remove a Single Cancer Study. @@ -51,8 +51,7 @@ public void run() { ""); } String cancerStudyIdentifier = args[0]; - - SpringUtil.initDataSource(); + ProgressMonitor.setCurrentMessage( "Checking if Cancer study with identifier " + cancerStudyIdentifier + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java b/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java index 82cad899..43c0b95d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ResetDatabase.java @@ -32,8 +32,26 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneset; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import 
org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.dao.DaoInfo; +import org.mskcc.cbio.portal.dao.DaoMutSig; +import org.mskcc.cbio.portal.dao.DaoMutation; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleList; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; +import org.mskcc.cbio.portal.dao.DaoUser; +import org.mskcc.cbio.portal.dao.DaoUserAuthorities; /** * Empty the database. @@ -91,7 +109,6 @@ public static void resetDatabase() throws DaoException { } public static void main(String[] args) throws DaoException { - SpringUtil.initDataSource(); StatDatabase.statDb(); ResetDatabase.resetDatabase(); System.err.println("Database Cleared and Reset."); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java b/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java index 7ee8d94a..b7124de0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/TransactionalScriptRunner.java @@ -1,14 +1,13 @@ package org.mskcc.cbio.portal.scripts; -import java.io.File; - -import org.mskcc.cbio.portal.util.SpringUtil; import org.mskcc.cbio.portal.util.TransactionalScripts; import org.springframework.context.support.FileSystemXmlApplicationContext; import org.springframework.transaction.TransactionStatus; import org.springframework.transaction.support.TransactionCallback; import org.springframework.transaction.support.TransactionTemplate; +import java.io.File; + /** * A high-level script runner than can be used to run a batch of scripts within a * transactional context. It's handy loading a batch of data of different types. @@ -69,8 +68,7 @@ public void run () { // Inject the context into SpringUtil, so we don't need to initialize again. 
// This ensures that the XML files from the command line provide a complete // context and we don't get data sources later from anywhere else. - SpringUtil.initDataSource(context); - + // Set up the transaction template transactionTemplate = (TransactionTemplate) context.getBean("scriptTransactionTemplate"); if (transactionTemplate == null) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java index af8a5ba9..00bcbf69 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCancerStudy.java @@ -23,10 +23,10 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.util.*; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.model.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -58,7 +58,6 @@ public void run() { "Invalid study status parameter: " + cancerStudyStatus); } - SpringUtil.initDataSource(); CancerStudy theCancerStudy = DaoCancerStudy.getCancerStudyByStableId(cancerStudyIdentifier); if (theCancerStudy == null) { throw new IllegalArgumentException("cancer study identified by cancer_study_identifier '" diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java index d9669ed5..8235ed3a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateMetaData.java @@ -32,9 +32,11 @@ package org.mskcc.cbio.portal.scripts; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.util.*; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import 
org.mskcc.cbio.portal.util.GeneticProfileReader; +import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.File; @@ -51,7 +53,6 @@ public static void main(String[] args) throws Exception { } ProgressMonitor.setConsoleMode(true); - SpringUtil.initDataSource(); File descriptorFile = new File(args[0]); GeneticProfile geneticProfile = GeneticProfileReader.loadGeneticProfileFromMeta(descriptorFile); From 0ff7031efc1153fde8cbaf9d5aa003184b59a7b9 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 16:19:26 +0200 Subject: [PATCH 071/130] Make error message more informative when gene panel is not found Do not throw NPE, but NSEE with error message that mentions panel id --- .../java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java index 16ab5098..3e27de9f 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java @@ -87,6 +87,9 @@ public static boolean outlierExpressionSelected(HashSet geneticProfileId public static int getGenePanelId(String panelId) { GenePanel genePanel = DaoGenePanel.getGenePanelByStableId(panelId); + if (genePanel == null) { + throw new NoSuchElementException("No gene panel with id " + panelId); + } return genePanel.getInternalId(); } From 54cc04e94ec58f414994061d390cb73e578b84f6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 16:27:44 +0200 Subject: [PATCH 072/130] Add more genes to the mini seed to load study_es_0 --- src/test/resources/seed_mini.sql | 63 ++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 552db83e..1fa9d4e1 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -199,6
+199,69 @@ INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYP INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2261,'FGFR3','protein-coding'); + +-- missing genes for study_es_0 +-- additional genes for CNA data +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,116983,'ACAP3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2073,'ERCC5','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,219293,'ATAD3C','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,375790,'AGRN','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,54998,'AURKAIP1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES 
(@max_entity_id,55210,'ATAD3A','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,83858,'ATAD3B','protein-coding'); +-- genes for data_methylation_hm27.txt +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,24145,'PANX1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,283234,'CCDC88B','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,3232,'HOXD3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,3613,'IMPA2','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,389,'RHOC','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,487,'ATP2A1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); 
+SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,7871,'SLMAP','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,8148,'TAF15','protein-coding'); +-- gene panels +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,55,'ACP3','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,81061,'OR11H1','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,388946,'TMEM247','protein-coding'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); +SET @max_entity_id = (Select MAX(ID) from genetic_entity); +INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,7157,'TP53','protein-coding'); + -- Generic genetic entities INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Erlotinib'); INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Irinotecan'); From a022aabd66725e1bc4c3d8d593a03925ea715b50 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 16:29:01 +0200 Subject: [PATCH 073/130] Make study_es_0_inc data pass validation --- 
tests/test_data/study_es_0_inc/data_cna_discrete.txt | 1 - tests/test_data/study_es_0_inc/data_cna_log2.txt | 1 - tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt | 5 +++++ tests/test_data/study_es_0_inc/data_expression_median.txt | 1 - tests/test_data/study_es_0_inc/data_methylation_hm27.txt | 1 - tests/test_data/study_es_0_inc/data_treatment_ic50.txt | 1 - 6 files changed, 5 insertions(+), 5 deletions(-) create mode 100644 tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete.txt b/tests/test_data/study_es_0_inc/data_cna_discrete.txt index 7915f45b..518b727c 100644 --- a/tests/test_data/study_es_0_inc/data_cna_discrete.txt +++ b/tests/test_data/study_es_0_inc/data_cna_discrete.txt @@ -4,7 +4,6 @@ ACAP3 116983 0 0 -1 ATAD3A 55210 0 0 -2 ATAD3B 83858 -2 -1 0 ATAD3C 219293 0 0 0 -#AURKAIP1 54998 ERCC5 2073 0 -1 -2 ACP3 55 0 0 0 TP53 -1 0 -2 diff --git a/tests/test_data/study_es_0_inc/data_cna_log2.txt b/tests/test_data/study_es_0_inc/data_cna_log2.txt index 0eb820a7..bb0fdb32 100644 --- a/tests/test_data/study_es_0_inc/data_cna_log2.txt +++ b/tests/test_data/study_es_0_inc/data_cna_log2.txt @@ -4,7 +4,6 @@ ACAP3 116983 0.751 0.533 0.114 ATAD3A 55210 0.487 0.695 0.364 ATAD3B 83858 0.150 0.492 0.300 ATAD3C 219293 0.995 0.170 0.654 -#AURKAIP1 54998 ERCC5 2073 0.816 0.514 0.165 ACP3 55 0.252 0.713 0.513 TP53 0.360 0.538 0.891 diff --git a/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt b/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt new file mode 100644 index 00000000..53d372d2 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_pd_annotations.txt @@ -0,0 +1,5 @@ +SAMPLE_ID Entrez_Gene_Id cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +TCGA-A1-A0SB-01 116983 Putative_Passenger Test passenger Class 2 Class annotation +TCGA-A1-A0SB-01 375790 Putative_Driver Test driver Class 1 Class annotation +TCGA-A1-A0SB-03 219293 
Putative_Passenger Test passenger +TCGA-BH-NEW 2073 Putative_Driver Test driver diff --git a/tests/test_data/study_es_0_inc/data_expression_median.txt b/tests/test_data/study_es_0_inc/data_expression_median.txt index d5c4a9a0..7e1f5a4b 100644 --- a/tests/test_data/study_es_0_inc/data_expression_median.txt +++ b/tests/test_data/study_es_0_inc/data_expression_median.txt @@ -4,7 +4,6 @@ ACAP3 116983 0.096 0.826 0.032 ATAD3A 55210 0.569 0.189 0.266 ATAD3B 83858 0.829 0.473 0.611 ATAD3C 219293 0.307 0.445 0.045 -#AURKAIP1 54998 ERCC5 2073 0.171 0.766 0.590 ACP3 55 0.422 0.870 0.745 TP53 0.179 0.694 0.808 diff --git a/tests/test_data/study_es_0_inc/data_methylation_hm27.txt b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt index d2c67abc..3db35409 100644 --- a/tests/test_data/study_es_0_inc/data_methylation_hm27.txt +++ b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt @@ -4,7 +4,6 @@ ACAP3 116983 0.022 0.681 0.790 ATAD3A 55210 0.229 0.946 0.439 ATAD3B 83858 0.885 0.707 0.664 ATAD3C 219293 0.660 0.315 0.694 -#AURKAIP1 54998 ERCC5 2073 0.436 0.749 0.345 ACP3 55 0.622 0.396 0.029 TP53 0.563 0.686 0.607 diff --git a/tests/test_data/study_es_0_inc/data_treatment_ic50.txt b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt index 2a507cef..806799de 100644 --- a/tests/test_data/study_es_0_inc/data_treatment_ic50.txt +++ b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt @@ -6,6 +6,5 @@ AZD6244 Name of AZD6244 Desc of AZD6244 Url of AZD6244 >8 >8 >8 Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 >8 >8 Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan NA 0.083 NA L-685458 Name of L-685458 Desc of L-685458 Url of L-685458 >8 >8 3.267752409 -#Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 NA >8 >8 Nilotinib Name of Nilotinib Desc of Nilotinib Url of Nilotinib >8 >8 NA From 90cc928a22eb4ea8846ce9d6a71bac2271500727 Mon Sep 17 00:00:00 2001 From: Ruslan 
Forostianov Date: Tue, 28 May 2024 16:29:53 +0200 Subject: [PATCH 074/130] Document in README how to load study_es_0 study --- README.md | 31 +++++++++++++++++++++++++++++-- 1 file changed, 29 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 51ccaf64..2acafd19 100644 --- a/README.md +++ b/README.md @@ -16,9 +16,36 @@ Build docker image with: docker build -t cbioportal-core . ``` -Example of how to start loading of the whole study: +### Example of how to load `study_es_0` study + +Import gene panels + +```bash +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenePanel.pl --data /data/study_es_0/data_gene_panel_testpanel1.txt +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenePanel.pl --data /data/study_es_0/data_gene_panel_testpanel2.txt +``` + +Import gene sets and supplementary data + +```bash +docker run -it -v $(pwd)/src/test/resources/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenesetData.pl --data /data/genesets/study_es_0_genesets.gmt --new-version msigdb_7.5.1 --supp /data/genesets/study_es_0_supp-genesets.txt +``` + +Import gene set hierarchy data + +```bash +docker run -it -v $(pwd)/src/test/resources/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +perl importGenesetHierarchy.pl --data /data/genesets/study_es_0_tree.yaml +``` + +Import study + ```bash -docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o +docker run -it -v $(pwd)/tests/test_data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core \ +python importer/metaImport.py -s /data/study_es_0 -p /data/api_json_system_tests -o ``` ### Incremental upload of data 
From fb75d7c82566c7fdb8e58bdec40bfe50cae7fb22 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 14:05:24 +0200 Subject: [PATCH 075/130] Implement incremental upload for timeline data --- scripts/importer/cbioportal_common.py | 1 + .../cbio/portal/dao/DaoClinicalEvent.java | 27 +++- .../portal/scripts/ImportTimelineData.java | 19 ++- .../mskcc/cbio/portal/util/ConsoleUtil.java | 4 +- .../TestIncrementalTimelineImport.java | 115 ++++++++++++++++++ .../incremental/clinical/data_timeline.txt | 5 + .../incremental/clinical/meta_timeline.txt | 4 + tests/system_tests_import_data.py | 3 + .../study_es_0_inc/data_timeline.txt | 4 + .../study_es_0_inc/meta_timeline.txt | 4 + 10 files changed, 176 insertions(+), 10 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java create mode 100644 src/test/resources/incremental/clinical/data_timeline.txt create mode 100644 src/test/resources/incremental/clinical/meta_timeline.txt create mode 100644 tests/test_data/study_es_0_inc/data_timeline.txt create mode 100644 tests/test_data/study_es_0_inc/meta_timeline.txt diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 798174ee..c68f68e3 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -380,6 +380,7 @@ class MetaFileTypes(object): MetaFileTypes.GENERIC_ASSAY_CONTINUOUS, MetaFileTypes.GENERIC_ASSAY_BINARY, MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, + MetaFileTypes.TIMELINE, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java index 21722902..d5045721 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java @@ -32,6 +32,9 @@ package org.mskcc.cbio.portal.dao; +import 
org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.portal.model.ClinicalEvent; + import java.sql.Connection; import java.sql.PreparedStatement; import java.sql.ResultSet; @@ -40,8 +43,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import org.apache.commons.lang3.StringUtils; -import org.mskcc.cbio.portal.model.ClinicalEvent; /** * @@ -52,7 +53,7 @@ private DaoClinicalEvent() {} public static int addClinicalEvent(ClinicalEvent clinicalEvent) { if (!MySQLbulkLoader.isBulkLoad()) { - throw new IllegalStateException("Only buld load mode is allowed for importing clinical events"); + throw new IllegalStateException("Only bulk load mode is allowed for importing clinical events"); } MySQLbulkLoader.getMySQLbulkLoader("clinical_event").insertRecord( @@ -202,6 +203,26 @@ public static void deleteByCancerStudyId(int cancerStudyId) throws DaoException JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs); } } + + public static void deleteByPatientId(int patientId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalEvent.class); + + pstmt = con.prepareStatement("DELETE clinical_event, clinical_event_data" + + " FROM clinical_event" + + " LEFT JOIN clinical_event_data ON clinical_event_data.CLINICAL_EVENT_ID = clinical_event.CLINICAL_EVENT_ID" + + " WHERE clinical_event.PATIENT_ID = ?"); + pstmt.setInt(1, patientId); + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs); + } + } public static void deleteAllRecords() throws DaoException { Connection con = null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java index 4b12a431..c16eba21 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java @@ -48,8 +48,10 @@ import java.io.FileReader; import java.io.IOException; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; import java.util.Properties; +import java.util.Set; /** * Imports timeline data for display in patient view @@ -58,7 +60,7 @@ */ public class ImportTimelineData extends ConsoleRunnable { - private static void importData(String dataFile, int cancerStudyId) throws IOException, DaoException { + private static void importData(String dataFile, int cancerStudyId, boolean overwriteExisting) throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); ProgressMonitor.setCurrentMessage("Reading file " + dataFile); @@ -81,9 +83,10 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce throw new RuntimeException("The first line must start with\n'PATIENT_ID\tSTART_DATE\tEVENT_TYPE'\nor\n" + "PATIENT_ID\tSTART_DATE\tSTOP_DATE\tEVENT_TYPE"); } - + long clinicalEventId = DaoClinicalEvent.getLargestClinicalEventId(); - + Set<Integer> processedPatientIds = new HashSet<>(); + while ((line = buff.readLine()) != null) { line = line.trim(); @@ -99,6 +102,9 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce ProgressMonitor.logWarning("Patient " + patientId + " not found in study " + cancerStudyId + ".
Skipping entry."); continue; } + if (overwriteExisting && processedPatientIds.add(patient.getInternalId())) { + DaoClinicalEvent.deleteByPatientId(patient.getInternalId()); + } ClinicalEvent event = new ClinicalEvent(); event.setClinicalEventId(++clinicalEventId); event.setPatientId(patient.getInternalId()); @@ -128,17 +134,18 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce public void run() { try { String description = "Import 'timeline' data"; - - OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + + OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, false); String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); + boolean overwriteExisting = options.has("overwrite-existing"); Properties properties = new TrimmedProperties(); properties.load(new FileInputStream(descriptorFile)); int cancerStudyInternalId = ValidationUtils.getInternalStudyId(properties.getProperty("cancer_study_identifier")); - importData(dataFile, cancerStudyInternalId); + importData(dataFile, cancerStudyInternalId, overwriteExisting); } catch (RuntimeException e) { throw e; } catch (IOException|DaoException e) { diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index 39fedd5f..7eba9610 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -138,8 +138,10 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de parser.accepts( "loadMode", "direct (per record) or bulk load of data" ) .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } + parser.accepts("overwrite-existing", + "Enables re-uploading molecular data that already exist for the given profile and 
sample.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String progName = "importScript"; - + OptionSet options = null; try { options = parser.parse( args ); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java new file mode 100644 index 00000000..c077c58b --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTimelineImport.java @@ -0,0 +1,115 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoPatient; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalEvent; +import org.mskcc.cbio.portal.model.Patient; +import org.mskcc.cbio.portal.scripts.ImportTimelineData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Map; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNull; + +/** + * Tests Incremental Import of Timeline Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalTimelineImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + + @Test + public void testTimelineDataReloading() throws DaoException { + MySQLbulkLoader.bulkLoadOn(); + ClinicalEvent event = new ClinicalEvent(); + event.setClinicalEventId(1L); + Patient sbPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SB"); + event.setPatientId(sbPatient.getInternalId()); + event.setStartDate(5L); + event.setEventType("SPECIMEN"); + event.setEventData(Map.of("SPECIMEN_SITE", "specimen_site_to_erase")); + DaoClinicalEvent.addClinicalEvent(event); + MySQLbulkLoader.flushAll(); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/clinical/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_timeline.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_timeline.txt"); + + ImportTimelineData importTimelineData = new ImportTimelineData(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importTimelineData.run(); + + List sbClinicalEvents = DaoClinicalEvent.getClinicalEvent(sbPatient.getInternalId()); + assertEquals(2, sbClinicalEvents.size()); + ClinicalEvent sbSpecimen = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("SPECIMEN")).findFirst().get(); + assertEquals(20L, sbSpecimen.getStartDate()); + assertEquals(60L, sbSpecimen.getStopDate()); + assertEquals(Map.of( + "SPECIMEN_SITE", "test_specimen_site_1", + "SPECIMEN_TYPE", "test_specimen_type", + "SOURCE", "test_source_3" + ), 
sbSpecimen.getEventData()); + ClinicalEvent sbStatus = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get(); + assertEquals(10L, sbStatus.getStartDate()); + assertEquals(20L, sbStatus.getStopDate()); + assertEquals(Map.of("SOURCE", "test_source_4"), sbStatus.getEventData()); + + Patient sdPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SD"); + List sdClinicalEvents = DaoClinicalEvent.getClinicalEvent(sdPatient.getInternalId()); + assertEquals(1, sdClinicalEvents.size()); + ClinicalEvent sdStatus = sdClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get(); + assertEquals(45L, sdStatus.getStartDate()); + assertNull(sdStatus.getStopDate()); + assertEquals(Map.of("SOURCE", "test_source_2"), sdStatus.getEventData()); + + Patient nonexistentPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "NONEXISTENT_PATIENT"); + assertNull(nonexistentPatient); + } + +} diff --git a/src/test/resources/incremental/clinical/data_timeline.txt b/src/test/resources/incremental/clinical/data_timeline.txt new file mode 100644 index 00000000..679a9da5 --- /dev/null +++ b/src/test/resources/incremental/clinical/data_timeline.txt @@ -0,0 +1,5 @@ +PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE +TCGA-A1-A0SB 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-A1-A0SB 10 20 STATUS test_source_4 +TCGA-A1-A0SD 45 STATUS test_source_2 +NONEXISTENT_PATIENT 100 200 STATUS test_source_1 diff --git a/src/test/resources/incremental/clinical/meta_timeline.txt b/src/test/resources/incremental/clinical/meta_timeline.txt new file mode 100644 index 00000000..bacded8c --- /dev/null +++ b/src/test/resources/incremental/clinical/meta_timeline.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: CLINICAL +datatype: TIMELINE +data_filename: data_timeline.txt diff --git 
a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 64361571..5fd45a69 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -117,6 +117,8 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress') treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress') + timeline_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTimelineData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_timeline.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_timeline.txt', '--noprogress') case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') @@ -130,6 +132,7 @@ def test_incremental_load(self, run_java, locate_jar): expression_median_call, methylation_hm27_call, treatment_ic50_call, + timeline_call, case_list_call, ]) diff --git a/tests/test_data/study_es_0_inc/data_timeline.txt b/tests/test_data/study_es_0_inc/data_timeline.txt new file mode 100644 index 00000000..e950603c --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_timeline.txt @@ -0,0 +1,4 @@ +PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE +TCGA-BH-A18K 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3 +TCGA-BH-A18K 10 20 STATUS test_source_4 +TCGA-BH-NEW 100 200 STATUS test_source_1 diff --git a/tests/test_data/study_es_0_inc/meta_timeline.txt b/tests/test_data/study_es_0_inc/meta_timeline.txt new file mode 100644 index 
00000000..51a46508 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_timeline.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: CLINICAL +datatype: TIMELINE +data_filename: data_timeline.txt From 3331223237cf44d9c976cc0d0afbd7649b0a573d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 22 May 2024 09:55:22 +0200 Subject: [PATCH 076/130] Implement incremental upload of CNA DISCRETE long data --- .../scripts/ImportCnaDiscreteLongData.java | 149 ++++++++++++++++-- .../portal/scripts/ImportProfileData.java | 9 +- .../portal/util/GeneticProfileReader.java | 19 ++- ...IncrementalCopyNumberAlterationImport.java | 102 ++++++++---- .../data_cna_discrete_long.txt | 37 +++++ .../meta_cna_discrete_long.txt | 9 ++ 6 files changed, 268 insertions(+), 57 deletions(-) create mode 100644 src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt create mode 100644 src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index f03e5b45..76954aa4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -36,6 +36,7 @@ import static com.google.common.collect.Lists.*; import static java.lang.String.*; import static org.cbioportal.model.MolecularProfile.DataType.DISCRETE; +import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; public class ImportCnaDiscreteLongData { @@ -49,22 +50,50 @@ public class ImportCnaDiscreteLongData { private int samplesSkipped = 0; private Set namespaces; + private boolean updateMode; + + private GeneticProfile geneticProfile; + private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); + private ArrayList orderedImportedSampleList; + private ArrayList 
orderedSampleList; + + private HashMap> geneticAlterationMap; public ImportCnaDiscreteLongData( - File cnaFile, - int geneticProfileId, - String genePanel, - DaoGeneOptimized daoGene, - DaoGeneticAlteration daoGeneticAlteration, - Set namespaces + File cnaFile, + int geneticProfileId, + String genePanel, + DaoGeneOptimized daoGene, + DaoGeneticAlteration daoGeneticAlteration, + Set namespaces, + boolean updateMode ) { this.namespaces = namespaces; this.cnaFile = cnaFile; this.geneticProfileId = geneticProfileId; + this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (!Set.of(DISCRETE.name(), DISCRETE_LONG.name()).contains(geneticProfile.getDatatype())) { + throw new IllegalStateException("Platform " + + geneticProfileId + + " has not supported datatype: " + + geneticProfile.getDatatype()); + } this.genePanel = genePanel; this.daoGene = daoGene; this.geneticAlterationGeneImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); + this.updateMode = updateMode; + } + + public ImportCnaDiscreteLongData( + File cnaFile, + int geneticProfileId, + String genePanel, + DaoGeneOptimized daoGene, + DaoGeneticAlteration daoGeneticAlteration, + Set namespaces + ) { + this(cnaFile, geneticProfileId, genePanel, daoGene, daoGeneticAlteration, namespaces, false); } public void importData() throws Exception { @@ -76,8 +105,9 @@ public void importData() throws Exception { int lineIndex = 1; String[] headerParts = line.split("\t", -1); this.cnaUtil = new CnaUtil(headerParts, this.namespaces); - - GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (updateMode) { + geneticAlterationMap = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(geneticProfile.getGeneticProfileId(), null); + } boolean isDiscretizedCnaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION @@ -97,10 +127,22 @@ public void 
importData() throws Exception { this.extractDataToImport(geneticProfile, line, lineIndex, toImport); } - DaoGeneticProfileSamples.addGeneticProfileSamples( - geneticProfileId, - newArrayList(toImport.eventsTable.columnKeySet()) - ); + orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); + if (updateMode) { + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + checkSamplesInDataEqualTo(initialOrderSampleListSize); + // add all new sample ids at the end + ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); + List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + extendedSampleList.addAll(newSampleIds); + orderedImportedSampleList = orderedSampleList; + orderedSampleList = extendedSampleList; + + + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(geneticProfileId); + } + DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); for (Long entrezId : toImport.eventsTable.rowKeySet()) { boolean added = storeGeneticAlterations(toImport, entrezId); @@ -117,6 +159,7 @@ public void importData() throws Exception { ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped()); buf.close(); + expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); MySQLbulkLoader.flushAll(); } @@ -130,8 +173,7 @@ public void extractDataToImport( int lineIndex, CnaImportData importContainer ) throws Exception { - boolean hasData = !line.startsWith("#") && line.trim().length() > 0; - if (!hasData) { + if (!FileUtil.isInfoLine(line)) { return; } String[] lineParts = line.split("\t", -1); @@ -147,6 +189,12 @@ public void extractDataToImport( String sampleIdStr = cnaUtil.getSampleIdStr(lineParts); Sample sample = findSample(sampleIdStr, cancerStudyId); + if (sample == null) { + if 
(StableIdUtil.isNormal(sampleIdStr)) { + return; + } + throw new RuntimeException("Sample with stable id " + sampleIdStr + " is not found in the database."); + } createSampleProfile(sample); long entrezId = gene.getEntrezGeneId(); @@ -175,6 +223,9 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc .filter(v -> v.cnaEvent != null) .map(v -> v.cnaEvent) .collect(Collectors.toList()); + if (updateMode) { + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedImportedSampleList); + } CnaUtil.storeCnaEvents(existingCnaEvents, events); } @@ -213,7 +264,75 @@ private boolean storeGeneticAlterations(CnaImportData toImport, Long entrezId) t ? gene.get().getHugoGeneSymbolAllCaps() : "" + entrezId; - return this.geneticAlterationGeneImporter.store(values, gene.get(), geneSymbol); + return saveValues(gene.get(), values, geneSymbol); + } + + //TODO duplicate + private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the inital sample list (" + + initialOrderSampleListSize + ")."); + } + }); + } + + //TODO duplicate + private boolean saveValues(CanonicalGene canonicalGene, String[] values, String geneSymbol) throws DaoException { + if (updateMode) { + values = updateValues(canonicalGene.getGeneticEntityId(), values); + if (!geneticAlterationGeneImporter.isImportedAlready(canonicalGene)) { + DaoGeneticAlteration.getInstance().deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + } + } + return geneticAlterationGeneImporter.store(values, canonicalGene, geneSymbol); + } + //TODO duplicate + private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { 
+ if (updateMode) { + DaoGeneticAlteration.getInstance().deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); + values = updateValues(geneticEntityId, values); + } + return DaoGeneticAlteration.getInstance().addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values) > 0; + } + + //TODO duplicate + private String[] updateValues(int geneticEntityId, String[] values) { + Map sampleIdToValue = ArrayUtil.zip(orderedImportedSampleList.toArray(new Integer[0]), values); + String[] updatedSampleValues = new String[orderedSampleList.size()]; + for (int i = 0; i < orderedSampleList.size(); i++) { + updatedSampleValues[i] = ""; + int sampleId = orderedSampleList.get(i); + if (geneticAlterationMap.containsKey(geneticEntityId)) { + HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); + updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? savedSampleIdToValue.remove(sampleId): ""; + if (savedSampleIdToValue.isEmpty()) { + geneticAlterationMap.remove(geneticEntityId); + } + } + if (sampleIdToValue.containsKey(sampleId)) { + updatedSampleValues[i] = sampleIdToValue.get(sampleId); + } + } + return updatedSampleValues; + } + + //TODO duplicate + private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { + if (updateMode) { + // Expand remaining genetic entity id rows that were not mentioned in the file + new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { + try { + String[] values = new String[orderedImportedSampleList.size()]; + Arrays.fill(values, ""); + saveValues(geneticEntityId, values); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }); + } } /** diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 193b7cf5..eefffb92 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -138,16 +138,17 @@ public void run() { } } else if( geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION - && DISCRETE_LONG.name().equals(geneticProfile.getDatatype()) + && DISCRETE_LONG.name().equals(geneticProfile.getOtherMetaDataField("datatype")) ) { Set namespaces = GeneticProfileReader.getNamespaces(descriptorFile); ImportCnaDiscreteLongData importer = new ImportCnaDiscreteLongData( - dataFile, - geneticProfile.getGeneticProfileId(), + dataFile, + geneticProfile.getGeneticProfileId(), genePanel, daoGene, daoGeneticAlteration, - namespaces + namespaces, + overwriteExisting ); importer.importData(); } else { diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index ab862756..7d3bb6cc 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -48,6 +48,9 @@ import org.mskcc.cbio.portal.model.GeneticProfileLink; import org.mskcc.cbio.portal.scripts.TrimmedProperties; +import static org.cbioportal.model.MolecularProfile.DataType.DISCRETE; +import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; + /** * Prepare a GeneticProfile for having its data loaded. 
* @@ -83,10 +86,18 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D + existingGeneticProfile.getStableId() + ") but different genetic alteration type: " + existingGeneticProfile.getGeneticProfileId()); } - if (!existingGeneticProfile.getDatatype().equals(geneticProfile.getDatatype())) { - throw new IllegalStateException("genetic_profile record found with same Stable ID (" - + existingGeneticProfile.getStableId() + ") but different data type: " - + existingGeneticProfile.getDatatype()); + if (DISCRETE_LONG.name().equals(geneticProfile.getDatatype())) { + if (!Set.of(DISCRETE_LONG.name(), DISCRETE.name()).contains(existingGeneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but unsupported data type: " + + existingGeneticProfile.getDatatype()); + } + } else { + if (!existingGeneticProfile.getDatatype().equals(geneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different data type: " + + existingGeneticProfile.getDatatype()); + } } if (geneticProfile.getCancerStudyId() != existingGeneticProfile.getCancerStudyId()) { throw new IllegalStateException("genetic_profile record found with same Stable ID (" diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java index ad3ebd55..f2bc2b1e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java @@ -18,8 +18,12 @@ package org.mskcc.cbio.portal.integrationTest.incremental; import org.cbioportal.model.CNA; 
+import org.junit.After; +import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; @@ -27,14 +31,15 @@ import org.mskcc.cbio.portal.model.CnaEvent; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportProfileData; -import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; -import org.springframework.transaction.annotation.Transactional; +import org.springframework.test.context.TestContextManager; +import org.springframework.transaction.PlatformTransactionManager; +import org.springframework.transaction.TransactionStatus; +import org.springframework.transaction.support.DefaultTransactionDefinition; import java.io.File; -import java.io.IOException; import java.util.Arrays; +import java.util.Collection; import java.util.HashMap; import java.util.HashSet; import java.util.List; @@ -53,40 +58,51 @@ * @author Ruslan Forostianov * @author Pieter Lukasse */ -@RunWith(SpringJUnit4ClassRunner.class) +@RunWith(Parameterized.class) @ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) -@Rollback -@Transactional public class TestIncrementalCopyNumberAlterationImport { + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + private final String metaFile; + private final String dataFile; + + { 
beforeEntrezIds.add(absentGeneEntrezId); } + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + { beforeSampleIds.add(updateSampleId); } + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + { afterSampleIds.add(newSampleId); } + + @Parameterized.Parameters + public static Collection primeNumbers() { + return Arrays.asList(new Object[][] { + { "meta_cna_discrete.txt", "data_cna_discrete.txt" }, + { "meta_cna_discrete_long.txt", "data_cna_discrete_long.txt" }, + }); + } + + public TestIncrementalCopyNumberAlterationImport(String metaFile, String dataFile) { + this.metaFile = metaFile; + this.dataFile = dataFile; + } + /** - * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE */ @Test - public void testDiscreteCNA() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ATM - final long absentGeneEntrezId = 472l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-XX-0800 - final int newSampleId = 15; - // stable_id: TCGA-A1-A0SO - final int updateSampleId = 12; - final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - final Set afterSampleIds = new HashSet<>(beforeSampleIds); - afterSampleIds.add(newSampleId); - + public void testDiscreteCNA() throws DaoException { GeneticProfile 
discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); assertNotNull(discreteCNAProfile); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); @@ -102,8 +118,8 @@ public void testDiscreteCNA() throws DaoException, IOException { assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); - File metaFile = new File(dataFolder, "meta_cna_discrete.txt"); - File dataFile = new File(dataFolder, "data_cna_discrete.txt"); + File metaFile = new File(dataFolder, this.metaFile); + File dataFile = new File(dataFolder, this.dataFile); /** * Test @@ -174,4 +190,22 @@ public void testDiscreteCNA() throws DaoException, IOException { updatedSampleEntrezGeneIdToCnaAlteration); } + private TestContextManager testContextManager; + + private PlatformTransactionManager transactionManager; + + private TransactionStatus transactionStatus; + @Before + public void before() throws Exception { + this.testContextManager = new TestContextManager(getClass()); + this.testContextManager.prepareTestInstance(this); + this.transactionManager = this.testContextManager.getTestContext().getApplicationContext().getBean(PlatformTransactionManager.class); + this.transactionStatus = transactionManager.getTransaction(new DefaultTransactionDefinition()); + DaoCancerStudy.reCacheAll(); + } + + @After + public void after() { + this.transactionManager.rollback(transactionStatus); + } } diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt new file mode 100644 index 00000000..88e406c4 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete_long.txt @@ -0,0 +1,37 @@ +Hugo_Symbol Entrez_Gene_Id Sample_Id Value cbp_driver 
cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +AKT3 10000 TCGA-XX-0800-01 0 +AKT3 10000 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +AKT3 10000 TCGA-A1-A0SO-01 -2 +AKT1 207 TCGA-XX-0800-01 -1 +AKT1 207 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +AKT1 207 TCGA-A1-A0SO-01 2 +# All after the pipe has to be removed +AKT2|TEST 208 TCGA-XX-0800-01 -2 +AKT2|TEST 208 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +AKT2|TEST 208 TCGA-A1-A0SO-01 -1 Putative_Driver Test driver Class 1 Class annotation +HRAS 3265 TCGA-XX-0800-01 2 +HRAS 3265 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +HRAS 3265 TCGA-A1-A0SO-01 0 +KRAS 3845 TCGA-XX-0800-01 0 Class 2 Class annotation +KRAS 3845 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +KRAS 3845 TCGA-A1-A0SO-01 2 Putative_Passenger Test passenger Class 2 Class annotation +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 TCGA-XX-0800-01 -2 + 4893 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 + 4893 TCGA-A1-A0SO-01 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 TCGA-XX-0800-01 2 +BRCA1 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +BRCA1 TCGA-A1-A0SO-01 0 +BRAF 673 TCGA-XX-0800-01 2 +BRAF 673 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +BRAF 673 TCGA-A1-A0SO-01 -2 +BRCA2 675 TCGA-XX-0800-01 -1.5 +BRCA2 675 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL 2 +BRCA2 675 TCGA-A1-A0SO-01 0 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 TCGA-XX-0800-01 -2 Putative_Driver +CDK1 983 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL -2 +CDK1 983 TCGA-A1-A0SO-01 2 Putative_Passenger Test passenger diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt new file mode 100644 index 00000000..4155601b --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt @@ -0,0 +1,9 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE_LONG +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete_long.txt +namespaces: CustomNamespace From d7e1918f53ca695b7e5a28f9c7f9c323e82a65bd Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 22 May 2024 11:18:25 +0200 Subject: [PATCH 077/130] Add data type sanity check for tsv uploded --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index cb613b08..43782731 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -204,6 +204,11 @@ private void doImportData() throws IOException, DaoException { && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY && headerParts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); + long typesDetected = List.of(isDiscretizedCnaProfile, isRppaProfile, 
isGsvaProfile, isGenericAssayProfile).stream().filter(Boolean::booleanValue).count(); + if (typesDetected > 1) { + throw new IllegalStateException("More then one data type is detected."); + } + int numRecordsToAdd = 0; int samplesSkipped = 0; try { From ee183e6de754ee19bc087f79ffb9e1c38dc8551d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 22 May 2024 14:30:46 +0200 Subject: [PATCH 078/130] Move storing/dedup logic of genetic alteration values to importer --- .../scripts/GeneticAlterationImporter.java | 25 +++++- .../scripts/ImportCnaDiscreteLongData.java | 2 +- .../portal/scripts/ImportTabDelimData.java | 83 ++++++++++--------- 3 files changed, 70 insertions(+), 40 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index 623b3122..509a6447 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -5,7 +5,8 @@ import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.util.*; +import java.util.HashSet; +import java.util.Set; import static java.lang.String.format; @@ -13,6 +14,7 @@ public class GeneticAlterationImporter { private final int geneticProfileId; private Set importSetOfGenes = new HashSet<>(); + private Set importSetOfGeneticEntityIds = new HashSet<>(); private DaoGeneticAlteration daoGeneticAlteration; public GeneticAlterationImporter( @@ -55,6 +57,27 @@ public boolean store( } } + /** + * Universal method that stores values for different genetic entities + * @param geneticEntityId + * @param values + * @return true if entity has been stored, false - if entity already existed + * @throws DaoException + */ + public boolean store( + int geneticEntityId, + String[] values + ) throws DaoException { + if (importSetOfGeneticEntityIds.add(geneticEntityId)) 
{ + daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); + return true; + } + else { + ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. Record will be skipped."); + return false; + } + } + public boolean isImportedAlready(CanonicalGene gene) { return importSetOfGenes.contains(gene.getEntrezGeneId()); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 76954aa4..b9817691 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -295,7 +295,7 @@ private boolean saveValues(int geneticEntityId, String[] values) throws DaoExcep DaoGeneticAlteration.getInstance().deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); values = updateValues(geneticEntityId, values); } - return DaoGeneticAlteration.getInstance().addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values) > 0; + return geneticAlterationGeneImporter.store(geneticEntityId, values); } //TODO duplicate diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 43782731..2a1391a5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -32,19 +32,52 @@ package org.mskcc.cbio.portal.scripts; -import java.io.*; -import java.util.*; +import org.apache.commons.lang3.ArrayUtils; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneset; +import 
org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.JdbcUtil; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.Geneset; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ArrayUtil; +import org.mskcc.cbio.portal.util.CnaUtil; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.EntrezValidator; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.AbstractMap; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.IntStream; import java.util.stream.Stream; -import org.apache.commons.lang3.ArrayUtils; -import org.cbioportal.model.EntityType; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - /** * Code to Import Copy Number Alteration, MRNA Expression Data, Methylation, or protein RPPA data @@ -55,7 +88,6 @@ public class ImportTabDelimData { public static final String CNA_VALUE_AMPLIFICATION = "2"; public static final String 
CNA_VALUE_HOMOZYGOUS_DELETION = "-2"; public static final String CNA_VALUE_PARTIAL_DELETION = "-1.5"; - private HashSet importedGeneticEntitySet = new HashSet<>(); private File dataFile; private String targetLine; private int geneticProfileId; @@ -737,7 +769,7 @@ private boolean saveValues(int geneticEntityId, String[] values) throws DaoExcep daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); values = updateValues(geneticEntityId, values); } - return daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values) > 0; + return geneticAlterationImporter.store(geneticEntityId, values); } private String[] updateValues(int geneticEntityId, String[] values) { @@ -840,10 +872,9 @@ private List composeCnaEventsToAdd(String[] values, long entrezGeneId) private boolean saveGenesetLine(String[] values, String genesetId) throws DaoException { boolean storedRecord = false; - Geneset geneset = DaoGeneset.getGenesetByExternalId(genesetId); if (geneset != null) { - storedRecord = storeGeneticEntityGeneticAlterations(values, geneset.getGeneticEntityId(), EntityType.GENESET, geneset.getExternalId()); + storedRecord = saveValues(geneset.getGeneticEntityId(), values); } else { ProgressMonitor.logWarning("Geneset " + genesetId + " not found in DB. Record will be skipped."); @@ -854,7 +885,7 @@ private boolean saveGenesetLine(String[] values, String genesetId) throws DaoExc /** * Parses line for generic assay profile record and stores record in 'genetic_alteration' table. 
*/ - private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map genericAssayStableIdToEntityIdMap) { + private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map genericAssayStableIdToEntityIdMap) throws DaoException { boolean recordIsStored = false; @@ -863,36 +894,12 @@ private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map if (entityId == null) { ProgressMonitor.logWarning("Generic Assay entity " + genericAssayId + " not found in DB. Record will be skipped."); } else { - recordIsStored = storeGeneticEntityGeneticAlterations(values, entityId, EntityType.GENERIC_ASSAY, genericAssayId); + recordIsStored = saveValues(entityId, values); } return recordIsStored; } - /** - * Stores genetic alteration data for a genetic entity. - * @param values - * @param geneticEntityId - internal id for genetic entity - * @param geneticEntityType - "GENE", "GENESET", "PHOSPHOPROTEIN" - * @param geneticEntityName - hugo symbol for "GENE", external id for "GENESET", phospho gene name for "PHOSPHOPROTEIN" - * @return boolean indicating if record was stored successfully or not - */ - private boolean storeGeneticEntityGeneticAlterations(String[] values, Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { - try { - if (importedGeneticEntitySet.add(geneticEntityId)) { - return saveValues(geneticEntityId, values); - } - else { - ProgressMonitor.logWarning("Data for genetic entity " + geneticEntityName - + " [" + geneticEntityType + "] already imported from file. 
Record will be skipped."); - return false; - } - } - catch (Exception ex) { - throw new RuntimeException("Aborted: Error found for row starting with " + geneticEntityName + ": " + ex.getMessage()); - } - } - /** * Tries to parse the genes and look them up in DaoGeneOptimized * From 697631fd77ef66618e982ec7832ecbb78a376180 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 22 May 2024 17:40:07 +0200 Subject: [PATCH 079/130] Move all inc. upload logic for tab delim. data types to GeneticAlterationImporter --- .../scripts/GeneticAlterationImporter.java | 84 +--------- .../GeneticAlterationImporterImpl.java | 105 ++++++++++++ .../GeneticAlterationIncrementalImporter.java | 111 ++++++++++++ .../scripts/ImportCnaDiscreteLongData.java | 158 ++++++------------ .../portal/scripts/ImportProfileData.java | 6 +- .../portal/scripts/ImportTabDelimData.java | 119 ++----------- .../TestIncrementalGsvaImporter.java | 1 - ...estIncrementalTabDelimDataTransaction.java | 32 +--- .../TestImportCnaDiscreteLongData.java | 15 -- .../scripts/TestImportTabDelimData.java | 10 +- 10 files changed, 306 insertions(+), 335 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java create mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index 509a6447..f0990dad 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -1,86 +1,20 @@ package org.mskcc.cbio.portal.scripts; import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.ProgressMonitor; -import java.util.HashSet; -import 
java.util.Set; +public interface GeneticAlterationImporter { -import static java.lang.String.format; + boolean store( + String[] values, + CanonicalGene gene, + String geneSymbol + ) throws DaoException; -public class GeneticAlterationImporter { - - private final int geneticProfileId; - private Set importSetOfGenes = new HashSet<>(); - private Set importSetOfGeneticEntityIds = new HashSet<>(); - private DaoGeneticAlteration daoGeneticAlteration; - - public GeneticAlterationImporter( - int geneticProfileId, - DaoGeneticAlteration daoGeneticAlteration - ) { - this.geneticProfileId = geneticProfileId; - this.daoGeneticAlteration = daoGeneticAlteration; - } - - /** - * Check that we have not already imported information regarding this gene. - * This is an important check, because a GISTIC or RAE file may contain - * multiple rows for the same gene, and we only want to import the first row. - */ - public boolean store( - String[] values, - CanonicalGene gene, - String geneSymbol - ) throws DaoException { - try { - if (importSetOfGenes.add(gene.getEntrezGeneId())) { - daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); - return true; - } else { - String geneSymbolMessage = ""; - if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { - geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; - } - ProgressMonitor.logWarning(format( - "Gene %s (%d)%s found to be duplicated in your file. 
Duplicated row will be ignored!", - gene.getHugoGeneSymbolAllCaps(), - gene.getEntrezGeneId(), - geneSymbolMessage) - ); - return false; - } - } catch (Exception e) { - throw new RuntimeException("Aborted: Error found for row starting with " + geneSymbol + ": " + e.getMessage()); - } - } - - /** - * Universal method that stores values for different genetic entities - * @param geneticEntityId - * @param values - * @return true if entity has been stored, false - if entity already existed - * @throws DaoException - */ - public boolean store( + boolean store( int geneticEntityId, String[] values - ) throws DaoException { - if (importSetOfGeneticEntityIds.add(geneticEntityId)) { - daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); - return true; - } - else { - ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. Record will be skipped."); - return false; - } - } - - public boolean isImportedAlready(CanonicalGene gene) { - return importSetOfGenes.contains(gene.getEntrezGeneId()); - } - + ) throws DaoException; + void finalise(); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java new file mode 100644 index 00000000..7589d3f8 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java @@ -0,0 +1,105 @@ +package org.mskcc.cbio.portal.scripts; + +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ProgressMonitor; + +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static java.lang.String.format; + +public class GeneticAlterationImporterImpl implements GeneticAlterationImporter { 
+ + private final int geneticProfileId; + private final Set importSetOfGenes = new HashSet<>(); + private final Set importSetOfGeneticEntityIds = new HashSet<>(); + + private final List orderedSampleList; + + private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + + public GeneticAlterationImporterImpl( + int geneticProfileId, + List orderedSampleList + ) throws DaoException { + this.geneticProfileId = geneticProfileId; + this.orderedSampleList = orderedSampleList; + storeOrderedSampleList(); + } + + private void storeOrderedSampleList() throws DaoException { + int rowCount = DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + if (rowCount < 1) { + throw new IllegalStateException("Failed to store the ordered sample list."); + } + } + + /** + * Check that we have not already imported information regarding this gene. + * This is an important check, because a GISTIC or RAE file may contain + * multiple rows for the same gene, and we only want to import the first row. + */ + @Override + public boolean store( + String[] values, + CanonicalGene gene, + String geneSymbol + ) throws DaoException { + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGenes.add(gene.getEntrezGeneId())) { + daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); + return true; + } + String geneSymbolMessage = ""; + if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { + geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; + } + ProgressMonitor.logWarning(format( + "Gene %s (%d)%s found to be duplicated in your file. 
Duplicated row will be ignored!", + gene.getHugoGeneSymbolAllCaps(), + gene.getEntrezGeneId(), + geneSymbolMessage) + ); + return false; + } + + + /** + * Universal method that stores values for different genetic entities + * @param geneticEntityId + * @param values + * @return true if entity has been stored, false - if entity already existed + * @throws DaoException + */ + @Override + public boolean store( + int geneticEntityId, + String[] values + ) throws DaoException { + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGeneticEntityIds.add(geneticEntityId)) { + daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); + return true; + } + ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. Record will be skipped."); + return false; + } + + private void ensureNumberOfValuesIsCorrect(int valuesNumber) { + if (valuesNumber != orderedSampleList.size()) { + throw new IllegalArgumentException("There has to be " + orderedSampleList.size() + " values, but only " + valuesNumber+ " has passed."); + } + } + + public boolean isImportedAlready(CanonicalGene gene) { + return importSetOfGenes.contains(gene.getEntrezGeneId()); + } + + + @Override + public void finalise() { } +} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java new file mode 100644 index 00000000..653dacb1 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -0,0 +1,111 @@ +package org.mskcc.cbio.portal.scripts; + +import org.jetbrains.annotations.NotNull; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; +import org.mskcc.cbio.portal.model.CanonicalGene; +import 
org.mskcc.cbio.portal.util.ArrayUtil; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +public class GeneticAlterationIncrementalImporter implements GeneticAlterationImporter { + + private final GeneticAlterationImporterImpl geneticAlterationImporter; + private final int geneticProfileId; + private final List fileOrderedSampleList; + private final List extendedOrderedSampleList; + private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + private final HashMap> geneticAlterationMap; + + public GeneticAlterationIncrementalImporter( + int geneticProfileId, + List fileOrderedSampleList + ) throws DaoException { + this.geneticProfileId = geneticProfileId; + this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); + this.fileOrderedSampleList = fileOrderedSampleList; + + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(this.geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the inital sample list (" + + initialOrderSampleListSize + ")."); + } + }); + // add all new sample ids at the end + this.extendedOrderedSampleList = new ArrayList<>(savedOrderedSampleList); + List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + this.extendedOrderedSampleList.addAll(newSampleIds); + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); + this.geneticAlterationImporter = new GeneticAlterationImporterImpl(geneticProfileId, 
extendedOrderedSampleList); + } + + @Override + public boolean store(String[] values, CanonicalGene gene, String geneSymbol) throws DaoException { + int geneticEntityId = gene.getGeneticEntityId(); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); + String[] expandedValues = extendValues(geneticEntityId, values); + return geneticAlterationImporter.store(expandedValues, gene, geneSymbol); + } + + @Override + public boolean store(int geneticEntityId, String[] values) throws DaoException { + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); + String[] expandedValues = extendValues(geneticEntityId, values); + return geneticAlterationImporter.store(geneticEntityId, expandedValues); + } + + @Override + public void finalise() { + expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); + geneticAlterationImporter.finalise(); + } + + @NotNull + private String[] extendValues(int geneticEntityId, String[] values) { + Map sampleIdToValue = mapWithFileOrderedSampleList(values); + String[] updatedSampleValues = new String[extendedOrderedSampleList.size()]; + for (int i = 0; i < extendedOrderedSampleList.size(); i++) { + updatedSampleValues[i] = ""; + int sampleId = extendedOrderedSampleList.get(i); + if (geneticAlterationMap.containsKey(geneticEntityId)) { + HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); + updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? 
savedSampleIdToValue.remove(sampleId): ""; + if (savedSampleIdToValue.isEmpty()) { + geneticAlterationMap.remove(geneticEntityId); + } + } + if (sampleIdToValue.containsKey(sampleId)) { + updatedSampleValues[i] = sampleIdToValue.get(sampleId); + } + } + return updatedSampleValues; + } + + @NotNull + private Map mapWithFileOrderedSampleList(String[] values) { + return ArrayUtil.zip(fileOrderedSampleList.toArray(new Integer[0]), values); + } + + private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { + // Expand remaining genetic entity id rows that were not mentioned in the file + new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { + try { + String[] values = new String[fileOrderedSampleList.size()]; + Arrays.fill(values, ""); + this.store(geneticEntityId, values); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }); + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index b9817691..5590e57a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -22,19 +22,43 @@ */ package org.mskcc.cbio.portal.scripts; -import com.google.common.base.*; -import com.google.common.collect.*; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.util.*; - -import java.io.*; +import com.google.common.base.Strings; +import com.google.common.collect.HashBasedTable; +import com.google.common.collect.Table; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import 
org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.JdbcUtil; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.CnaUtil; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; +import org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; import java.util.Optional; -import java.util.*; -import java.util.stream.*; +import java.util.Set; +import java.util.stream.Collectors; -import static com.google.common.collect.Lists.*; -import static java.lang.String.*; +import static com.google.common.collect.Lists.newArrayList; +import static java.lang.String.format; import static org.cbioportal.model.MolecularProfile.DataType.DISCRETE; import static org.cbioportal.model.MolecularProfile.ImportType.DISCRETE_LONG; @@ -42,7 +66,7 @@ public class ImportCnaDiscreteLongData { private final File cnaFile; private final int geneticProfileId; - private final GeneticAlterationImporter geneticAlterationGeneImporter; + private GeneticAlterationImporter geneticAlterationGeneImporter; private String genePanel; private final DaoGeneOptimized daoGene; private CnaUtil cnaUtil; @@ -55,7 +79,6 @@ public class ImportCnaDiscreteLongData { private GeneticProfile geneticProfile; private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); - private ArrayList orderedImportedSampleList; private ArrayList orderedSampleList; private HashMap> geneticAlterationMap; @@ 
-65,7 +88,6 @@ public ImportCnaDiscreteLongData( int geneticProfileId, String genePanel, DaoGeneOptimized daoGene, - DaoGeneticAlteration daoGeneticAlteration, Set namespaces, boolean updateMode ) { @@ -81,7 +103,6 @@ public ImportCnaDiscreteLongData( } this.genePanel = genePanel; this.daoGene = daoGene; - this.geneticAlterationGeneImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.updateMode = updateMode; } @@ -90,13 +111,23 @@ public ImportCnaDiscreteLongData( int geneticProfileId, String genePanel, DaoGeneOptimized daoGene, - DaoGeneticAlteration daoGeneticAlteration, Set namespaces ) { - this(cnaFile, geneticProfileId, genePanel, daoGene, daoGeneticAlteration, namespaces, false); + this(cnaFile, geneticProfileId, genePanel, daoGene, namespaces, false); + } + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); } - public void importData() throws Exception { + private void doImportData() throws Exception { FileReader reader = new FileReader(this.cnaFile); BufferedReader buf = new BufferedReader(reader); @@ -128,21 +159,8 @@ public void importData() throws Exception { } orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); - if (updateMode) { - ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); - int initialOrderSampleListSize = savedOrderedSampleList.size(); - checkSamplesInDataEqualTo(initialOrderSampleListSize); - // add all new sample ids at the end - ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); - List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); - extendedSampleList.addAll(newSampleIds); - orderedImportedSampleList = orderedSampleList; - orderedSampleList = extendedSampleList; - - - 
DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(geneticProfileId); - } - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + this.geneticAlterationGeneImporter = updateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); for (Long entrezId : toImport.eventsTable.rowKeySet()) { boolean added = storeGeneticAlterations(toImport, entrezId); @@ -159,7 +177,7 @@ public void importData() throws Exception { ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped()); buf.close(); - expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); + geneticAlterationGeneImporter.finalise(); MySQLbulkLoader.flushAll(); } @@ -224,7 +242,7 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc .map(v -> v.cnaEvent) .collect(Collectors.toList()); if (updateMode) { - DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedImportedSampleList); + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); } CnaUtil.storeCnaEvents(existingCnaEvents, events); } @@ -264,75 +282,7 @@ private boolean storeGeneticAlterations(CnaImportData toImport, Long entrezId) t ? 
gene.get().getHugoGeneSymbolAllCaps() : "" + entrezId; - return saveValues(gene.get(), values, geneSymbol); - } - - //TODO duplicate - private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { - geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { - if (sampleToValue.size() != initialOrderSampleListSize) { - throw new IllegalStateException("Number of samples (" - + sampleToValue.size() + ") for genetic entity with id " - + geneticEntityId + " does not match with the number in the inital sample list (" - + initialOrderSampleListSize + ")."); - } - }); - } - - //TODO duplicate - private boolean saveValues(CanonicalGene canonicalGene, String[] values, String geneSymbol) throws DaoException { - if (updateMode) { - values = updateValues(canonicalGene.getGeneticEntityId(), values); - if (!geneticAlterationGeneImporter.isImportedAlready(canonicalGene)) { - DaoGeneticAlteration.getInstance().deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); - } - } - return geneticAlterationGeneImporter.store(values, canonicalGene, geneSymbol); - } - //TODO duplicate - private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { - if (updateMode) { - DaoGeneticAlteration.getInstance().deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); - values = updateValues(geneticEntityId, values); - } - return geneticAlterationGeneImporter.store(geneticEntityId, values); - } - - //TODO duplicate - private String[] updateValues(int geneticEntityId, String[] values) { - Map sampleIdToValue = ArrayUtil.zip(orderedImportedSampleList.toArray(new Integer[0]), values); - String[] updatedSampleValues = new String[orderedSampleList.size()]; - for (int i = 0; i < orderedSampleList.size(); i++) { - updatedSampleValues[i] = ""; - int sampleId = orderedSampleList.get(i); - if (geneticAlterationMap.containsKey(geneticEntityId)) { - HashMap savedSampleIdToValue = 
geneticAlterationMap.get(geneticEntityId); - updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? savedSampleIdToValue.remove(sampleId): ""; - if (savedSampleIdToValue.isEmpty()) { - geneticAlterationMap.remove(geneticEntityId); - } - } - if (sampleIdToValue.containsKey(sampleId)) { - updatedSampleValues[i] = sampleIdToValue.get(sampleId); - } - } - return updatedSampleValues; - } - - //TODO duplicate - private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { - if (updateMode) { - // Expand remaining genetic entity id rows that were not mentioned in the file - new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { - try { - String[] values = new String[orderedImportedSampleList.size()]; - Arrays.fill(values, ""); - saveValues(geneticEntityId, values); - } catch (DaoException e) { - throw new RuntimeException(e); - } - }); - } + return geneticAlterationGeneImporter.store(values, gene.get(), geneSymbol); } /** diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index eefffb92..a02e7f4a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -58,7 +58,6 @@ public void run() { DaoGeneOptimized daoGene; DaoGeneticAlteration daoGeneticAlteration; daoGene = DaoGeneOptimized.getInstance(); - daoGeneticAlteration = DaoGeneticAlteration.getInstance(); try { // Parse arguments @@ -132,7 +131,7 @@ public void run() { genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), overwriteExisting, - daoGeneticAlteration, daoGene + daoGene ); genericAssayProfileImporter.importData(); } @@ -146,7 +145,6 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, daoGene, - daoGeneticAlteration, namespaces, overwriteExisting ); @@ -158,7 +156,7 @@ public void run() { 
geneticProfile.getGeneticProfileId(), genePanel, overwriteExisting, - daoGeneticAlteration, daoGene + daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); if (pdAnnotationsFilename != null && !"".equals(pdAnnotationsFilename)) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 2a1391a5..f1bd0bb5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -37,7 +37,6 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneset; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; import org.mskcc.cbio.portal.dao.DaoSample; @@ -99,16 +98,13 @@ public class ImportTabDelimData { private String genericEntityProperties; private File pdAnnotationsFile; private Map, Map> pdAnnotations; - private final GeneticAlterationImporter geneticAlterationImporter; + private GeneticAlterationImporter geneticAlterationImporter; private int numLines; - private DaoGeneticAlteration daoGeneticAlteration; private DaoGeneOptimized daoGene; private boolean updateMode; - private HashMap> geneticAlterationMap; - private ArrayList orderedImportedSampleList; private ArrayList orderedSampleList; /** @@ -131,10 +127,9 @@ public ImportTabDelimData( String genePanel, String genericEntityProperties, boolean updateMode, - DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, targetLine, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); + this(dataFile, targetLine, geneticProfileId, genePanel, updateMode, daoGene); this.genericEntityProperties = genericEntityProperties; } @@ -155,10 +150,9 @@ 
public ImportTabDelimData( int geneticProfileId, String genePanel, boolean updateMode, - DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); + this(dataFile, geneticProfileId, genePanel, updateMode, daoGene); this.targetLine = targetLine; } @@ -174,15 +168,12 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, boolean updateMode, - DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanel = genePanel; this.updateMode = updateMode; - this.daoGeneticAlteration = daoGeneticAlteration; - this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.daoGene = daoGene; this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); if (this.updateMode @@ -213,9 +204,6 @@ private void doImportData() throws IOException, DaoException { } catch (IOException e) { throw new RuntimeException(e); } - if (updateMode) { - geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfile.getGeneticProfileId(), null); - } ProgressMonitor.setMaxValue(numLines); FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); @@ -316,7 +304,8 @@ private void doImportData() throws IOException, DaoException { } ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); - saveOrderedSampleList(); + this.geneticAlterationImporter = updateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); @@ -408,7 +397,7 @@ private void doImportData() throws IOException, DaoException { line = buf.readLine(); } - expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); + geneticAlterationImporter.finalise(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -430,21 +419,6 @@ private void doImportData() throws IOException, DaoException { } } - private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { - if (updateMode) { - // Expand remaining genetic entity id rows that were not mentioned in the file - new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { - try { - String[] values = new String[orderedImportedSampleList.size()]; - Arrays.fill(values, ""); - saveValues(geneticEntityId, values); - } catch (DaoException e) { - throw new RuntimeException(e); - } - }); - } - } - private void ensureSampleGeneticProfile(Sample sample) throws DaoException { if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { Integer genePanelID = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); @@ -455,35 +429,6 @@ private void ensureSampleGeneticProfile(Sample sample) throws DaoException { } } - private void saveOrderedSampleList() throws DaoException { - if (updateMode) { - ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); - int initialOrderSampleListSize = savedOrderedSampleList.size(); - checkSamplesInDataEqualTo(initialOrderSampleListSize); - // add all new sample ids at the end - ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); - List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); - extendedSampleList.addAll(newSampleIds); - orderedImportedSampleList = orderedSampleList; - orderedSampleList = extendedSampleList; - - - DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(geneticProfileId); - } - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - } - - private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { - geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { - if (sampleToValue.size() != initialOrderSampleListSize) { - throw new IllegalStateException("Number of samples (" - + sampleToValue.size() + ") for genetic entity with id " - + geneticEntityId + " does not match with the number in the inital sample list (" - + initialOrderSampleListSize + ")."); - } - }); - } - private Map, Map> readPdAnnotations(File pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; @@ -711,7 +656,7 @@ private boolean saveLine(String[] values, if (!microRNAGenes.isEmpty()) { // for micro rna, duplicate the data for (CanonicalGene gene : microRNAGenes) { - if (this.saveValues(gene, values, geneSymbol)) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { recordStored = true; } } @@ -731,11 +676,11 @@ private boolean saveLine(String[] values, // none of 
the matched genes are type "miRNA" if (genes.size() == 1) { // Store all values per gene: - recordStored = this.saveValues(genes.get(0), values, geneSymbol); + recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { if (updateMode) { - DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedImportedSampleList); + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); } long entrezGeneId = genes.get(0).getEntrezGeneId(); CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, entrezGeneId)); @@ -755,46 +700,9 @@ private boolean saveLine(String[] values, return recordStored; } - private boolean saveValues(CanonicalGene canonicalGene, String[] values, String geneSymbol) throws DaoException { - if (updateMode) { - values = updateValues(canonicalGene.getGeneticEntityId(), values); - if (!geneticAlterationImporter.isImportedAlready(canonicalGene)) { - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); - } - } - return geneticAlterationImporter.store(values, canonicalGene, geneSymbol); - } - private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { - if (updateMode) { - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); - values = updateValues(geneticEntityId, values); - } - return geneticAlterationImporter.store(geneticEntityId, values); - } - - private String[] updateValues(int geneticEntityId, String[] values) { - Map sampleIdToValue = ArrayUtil.zip(orderedImportedSampleList.toArray(new Integer[0]), values); - String[] updatedSampleValues = new String[orderedSampleList.size()]; - for (int i = 0; i < orderedSampleList.size(); i++) { - updatedSampleValues[i] = ""; - int sampleId = orderedSampleList.get(i); - if 
(geneticAlterationMap.containsKey(geneticEntityId)) { - HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); - updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? savedSampleIdToValue.remove(sampleId): ""; - if (savedSampleIdToValue.isEmpty()) { - geneticAlterationMap.remove(geneticEntityId); - } - } - if (sampleIdToValue.containsKey(sampleId)) { - updatedSampleValues[i] = sampleIdToValue.get(sampleId); - } - } - return updatedSampleValues; - } - private boolean saveRppaValues(String[] values, boolean recordStored, List genes, String geneSymbol) throws DaoException { for (CanonicalGene gene : genes) { - if (this.saveValues(gene, values, geneSymbol)) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { recordStored = true; nrExtraRecords++; } @@ -830,9 +738,6 @@ private List parseGenes(String entrez, String geneSymbol) { } private List composeCnaEventsToAdd(String[] values, long entrezGeneId) { - if (updateMode) { - values = updateValues((int) entrezGeneId, values); - } List cnaEventsToAdd = new ArrayList(); for (int i = 0; i < values.length; i++) { @@ -874,7 +779,7 @@ private boolean saveGenesetLine(String[] values, String genesetId) throws DaoExc Geneset geneset = DaoGeneset.getGenesetByExternalId(genesetId); if (geneset != null) { - storedRecord = saveValues(geneset.getGeneticEntityId(), values); + storedRecord = this.geneticAlterationImporter.store(geneset.getGeneticEntityId(), values); } else { ProgressMonitor.logWarning("Geneset " + genesetId + " not found in DB. Record will be skipped."); @@ -894,7 +799,7 @@ private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map if (entityId == null) { ProgressMonitor.logWarning("Generic Assay entity " + genericAssayId + " not found in DB. 
Record will be skipped."); } else { - recordIsStored = saveValues(entityId, values); + recordIsStored = this.geneticAlterationImporter.store(entityId, values); } return recordIsStored; diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java index c629ecb4..c6fff1fa 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java @@ -74,7 +74,6 @@ public void testGsvaIsNotSupported() throws DaoException, IOException { DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), null, true, - DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance())); } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java index f149d959..d909015c 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java @@ -17,49 +17,33 @@ package org.mskcc.cbio.portal.integrationTest.incremental; -import org.cbioportal.model.CNA; -import org.jetbrains.annotations.NotNull; import org.junit.Before; import org.junit.Test; import org.junit.jupiter.api.extension.ExtendWith; import org.junit.runner.RunWith; import org.mockito.junit.jupiter.MockitoExtension; import org.mskcc.cbio.portal.dao.DaoCancerStudy; -import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; -import 
org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.model.CnaEvent; -import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; -import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; import org.springframework.transaction.annotation.Propagation; import org.springframework.transaction.annotation.Transactional; import java.io.File; -import java.io.IOException; -import java.util.Arrays; import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; import static org.mockito.ArgumentMatchers.anyLong; -import static org.mockito.Mockito.doNothing; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.when; /** * Tests Transaction for Incremental Import of Tab Delimited Data. 
@@ -81,15 +65,15 @@ public class TestIncrementalTabDelimDataTransaction { public void testTransaction() throws Exception { GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFolder = new File("src/test/resources/incremental/mrna_expression/"); File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - DaoGeneticAlteration mockedDao = mock(DaoGeneticAlteration.class); + DaoGeneOptimized mockedDao = mock(DaoGeneOptimized.class); - doNothing().doNothing().doThrow(new DaoException("Simulated dao error")) - .when(mockedDao).deleteAllRecordsInGeneticProfile(anyLong(), anyLong()); + when(mockedDao.getGene(anyLong())) + .thenThrow(new RuntimeException("Simulated error")); /** * Test */ @@ -98,10 +82,10 @@ public void testTransaction() throws Exception { mrnaProfile.getGeneticProfileId(), null, true, - mockedDao, - DaoGeneOptimized.getInstance()).importData(); + mockedDao).importData(); fail("Import has to fail"); } catch (RuntimeException runtimeException) { + assertTrue(runtimeException.getMessage(), runtimeException.getMessage().contains("Simulated error")); assertTrue(true); } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java index 916a16cd..27eb111e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java @@ -108,7 +108,6 @@ public void testImportCnaDiscreteLongDataAddsSamples() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - 
DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test new samples are added: @@ -134,7 +133,6 @@ public void testImportCnaDiscreteLongDataAddsCnaEvents() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); @@ -188,7 +186,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Excepti geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alterations are added for all genes: @@ -213,7 +210,6 @@ public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are added of non-cna event: @@ -241,7 +237,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamples geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -268,7 +263,6 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo( geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -291,7 +285,6 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrect geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test order of genetic alteration values: @@ -314,7 +307,6 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents( geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), 
- DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are added of non-cna event: @@ -342,7 +334,6 @@ public void testImportCnaDiscreteLongDataIgnoresLineWithDuplicateGene() throws E geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces).importData(); // Test genetic alteration are deduplicated: @@ -364,7 +355,6 @@ public void testImportCnaDiscreteLongDataAddsPdAnnotations() throws Exception { geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); List genes = newArrayList(3983L, 27334L, 2115L); @@ -394,7 +384,6 @@ public void testImportCnaDiscreteLongData_changesProfileDatatypeFromDiscreteLong geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), noNamespaces ).importData(); @@ -418,7 +407,6 @@ public void testImportCnaDiscreteLongDataOnlyAddsSpecifiedCustomNamespaceColumns geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespacesToImport ).importData(); @@ -458,7 +446,6 @@ public void testImportCnaDiscreteLongDataImportsMissingNamespacesAsNull() throws geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespacesToImport ).importData(); @@ -507,7 +494,6 @@ public void testImportCnaDiscreteLongDataAddsCustomNamespaceColumnsForEachSample geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespaces ).importData(); @@ -549,7 +535,6 @@ public void testImportCnaDiscreteLongDataImportsCustomNamespaceColumnsAsNullWhen geneticProfile.getGeneticProfileId(), genePanel, DaoGeneOptimized.getInstance(), - DaoGeneticAlteration.getInstance(), namespaces ).importData(); diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index f8bcc335..68f8940d 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -167,7 +167,7 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, false, DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); @@ -231,7 +231,7 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, false, DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); @@ -315,7 +315,7 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, 
DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); @@ -368,7 +368,7 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); parser.importData(); // check if expected warnings are given: @@ -460,7 +460,7 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); From 65c8b119e4bc3fd02af9689c99852ce3d8d18ee7 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 24 May 2024 08:56:02 +0200 Subject: [PATCH 080/130] Add CNA DISCRETE LONG to study_es0_inc test dataset --- tests/system_tests_import_data.py | 3 ++ .../study_es_0_inc/data_cna_discrete_long.txt | 37 +++++++++++++++++++ .../study_es_0_inc/meta_cna_discrete_long.txt | 9 +++++ 3 files changed, 49 insertions(+) create mode 100644 tests/test_data/study_es_0_inc/data_cna_discrete_long.txt create mode 100644 tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt diff --git 
a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 5fd45a69..f795c122 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -109,6 +109,8 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') cna_discrete_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete.txt', '--noprogress') + cna_discrete_long_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_discrete_long.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete_long.txt', '--noprogress') cna_log2_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_log2.txt', '--noprogress') expression_median_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', @@ -128,6 +130,7 @@ def test_incremental_load(self, run_java, locate_jar): clinical_sample_call, mutation_call, cna_discrete_call, + cna_discrete_long_call, cna_log2_call, expression_median_call, methylation_hm27_call, diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/data_cna_discrete_long.txt new file mode 100644 index 00000000..cd357405 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_discrete_long.txt @@ -0,0 +1,37 @@ +Hugo_Symbol Entrez_Gene_Id Sample_Id Value cbp_driver 
cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +AKT3 10000 TCGA-C8-A12K-01 0 +AKT3 10000 TCGA-BH-NON-EXIST -2 +AKT3 10000 TCGA-AO-A129-01 -2 +AKT1 207 TCGA-C8-A12K-01 -1 +AKT1 207 TCGA-BH-NON-EXIST 2 +AKT1 207 TCGA-AO-A129-01 2 +# All after the pipe has to be removed +AKT2|TEST 208 TCGA-C8-A12K-01 -2 +AKT2|TEST 208 TCGA-BH-NON-EXIST 2 +AKT2|TEST 208 TCGA-AO-A129-01 -1 Putative_Driver Test driver Class 1 Class annotation +HRAS 3265 TCGA-C8-A12K-01 2 +HRAS 3265 TCGA-BH-NON-EXIST 2 +HRAS 3265 TCGA-AO-A129-01 0 +KRAS 3845 TCGA-C8-A12K-01 0 Class 2 Class annotation +KRAS 3845 TCGA-BH-NON-EXIST -2 +KRAS 3845 TCGA-AO-A129-01 2 Putative_Passenger Test passenger Class 2 Class annotation +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 TCGA-C8-A12K-01 -2 + 4893 TCGA-BH-NON-EXIST -2 + 4893 TCGA-AO-A129-01 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 TCGA-C8-A12K-01 2 +BRCA1 TCGA-BH-NON-EXIST 2 +BRCA1 TCGA-AO-A129-01 0 +BRAF 673 TCGA-C8-A12K-01 2 +BRAF 673 TCGA-BH-NON-EXIST -2 +BRAF 673 TCGA-AO-A129-01 -2 +BRCA2 675 TCGA-C8-A12K-01 -1.5 +BRCA2 675 TCGA-BH-NON-EXIST 2 +BRCA2 675 TCGA-AO-A129-01 0 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 TCGA-C8-A12K-01 -2 Putative_Driver +CDK1 983 TCGA-BH-NON-EXIST -2 +CDK1 983 TCGA-AO-A129-01 2 Putative_Passenger Test passenger diff --git a/tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt new file mode 100644 index 00000000..0a353729 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt @@ -0,0 +1,9 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. +profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete_long.txt +namespaces: CustomNamespace From 0bf6bf2786ddcce219ffa5e759e38bbc7c52f59a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 24 May 2024 09:16:27 +0200 Subject: [PATCH 081/130] Remove unused code --- .../cbio/portal/scripts/ImportCnaDiscreteLongData.java | 7 ------- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 2 -- 2 files changed, 9 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 5590e57a..4a44d453 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -28,7 +28,6 @@ import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoSample; import 
org.mskcc.cbio.portal.dao.DaoSampleProfile; @@ -50,7 +49,6 @@ import java.io.File; import java.io.FileReader; import java.util.ArrayList; -import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Optional; @@ -81,8 +79,6 @@ public class ImportCnaDiscreteLongData { private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); private ArrayList orderedSampleList; - private HashMap> geneticAlterationMap; - public ImportCnaDiscreteLongData( File cnaFile, int geneticProfileId, @@ -136,9 +132,6 @@ private void doImportData() throws Exception { int lineIndex = 1; String[] headerParts = line.split("\t", -1); this.cnaUtil = new CnaUtil(headerParts, this.namespaces); - if (updateMode) { - geneticAlterationMap = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(geneticProfile.getGeneticProfileId(), null); - } boolean isDiscretizedCnaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.COPY_NUMBER_ALTERATION diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index f1bd0bb5..81ea75ab 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -38,7 +38,6 @@ import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneset; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; import org.mskcc.cbio.portal.dao.DaoSample; import org.mskcc.cbio.portal.dao.DaoSampleProfile; import org.mskcc.cbio.portal.dao.JdbcUtil; @@ -49,7 +48,6 @@ import org.mskcc.cbio.portal.model.GeneticAlterationType; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Sample; -import org.mskcc.cbio.portal.util.ArrayUtil; import org.mskcc.cbio.portal.util.CnaUtil; import 
org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.EntrezValidator; From cc80e56cc03c14e296a0cfea86d6967eb8833114 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 17:19:44 +0200 Subject: [PATCH 082/130] Make validation to pass for CNA long and study_es_0_inc data --- scripts/importer/validateData.py | 2 +- tests/system_tests_import_data.py | 3 --- .../{ => cna_long}/data_cna_discrete_long.txt | 6 ------ .../{ => cna_long}/meta_cna_discrete_long.txt | 3 +-- 4 files changed, 2 insertions(+), 12 deletions(-) rename tests/test_data/study_es_0_inc/{ => cna_long}/data_cna_discrete_long.txt (74%) rename tests/test_data/study_es_0_inc/{ => cna_long}/meta_cna_discrete_long.txt (89%) diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 8c77e018..eb5a0392 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -4718,7 +4718,7 @@ def process_metadata_files(directory, portal_instance, logger, relaxed_mode, str if stable_id in stable_ids: # stable id already used in other meta file, give error: logger.error( - 'stable_id repeated. It should be unique across all files in a study', + 'stable_id repeated. 
It should be unique across all files in a directory', extra={'filename_': filename, 'cause': stable_id}) else: diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index f795c122..5fd45a69 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -109,8 +109,6 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') cna_discrete_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete.txt', '--noprogress') - cna_discrete_long_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', - '--meta', f'{data_directory}/meta_cna_discrete_long.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete_long.txt', '--noprogress') cna_log2_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_log2.txt', '--noprogress') expression_median_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', @@ -130,7 +128,6 @@ def test_incremental_load(self, run_java, locate_jar): clinical_sample_call, mutation_call, cna_discrete_call, - cna_discrete_long_call, cna_log2_call, expression_median_call, methylation_hm27_call, diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt similarity index 74% rename from 
tests/test_data/study_es_0_inc/data_cna_discrete_long.txt rename to tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt index cd357405..9c3b1427 100644 --- a/tests/test_data/study_es_0_inc/data_cna_discrete_long.txt +++ b/tests/test_data/study_es_0_inc/cna_long/data_cna_discrete_long.txt @@ -5,7 +5,6 @@ AKT3 10000 TCGA-AO-A129-01 -2 AKT1 207 TCGA-C8-A12K-01 -1 AKT1 207 TCGA-BH-NON-EXIST 2 AKT1 207 TCGA-AO-A129-01 2 -# All after the pipe has to be removed AKT2|TEST 208 TCGA-C8-A12K-01 -2 AKT2|TEST 208 TCGA-BH-NON-EXIST 2 AKT2|TEST 208 TCGA-AO-A129-01 -1 Putative_Driver Test driver Class 1 Class annotation @@ -15,13 +14,9 @@ HRAS 3265 TCGA-AO-A129-01 0 KRAS 3845 TCGA-C8-A12K-01 0 Class 2 Class annotation KRAS 3845 TCGA-BH-NON-EXIST -2 KRAS 3845 TCGA-AO-A129-01 2 Putative_Passenger Test passenger Class 2 Class annotation -# This gene absent in this file, but it's still part of the profile and has to be updated -#ATM 472 -# This line missing the hugo symbol and the gene has to be detected by entrez id 4893 TCGA-C8-A12K-01 -2 4893 TCGA-BH-NON-EXIST -2 4893 TCGA-AO-A129-01 -1 -# This line missing the entrez id and the gene has to be detected by hugo symbol BRCA1 TCGA-C8-A12K-01 2 BRCA1 TCGA-BH-NON-EXIST 2 BRCA1 TCGA-AO-A129-01 0 @@ -31,7 +26,6 @@ BRAF 673 TCGA-AO-A129-01 -2 BRCA2 675 TCGA-C8-A12K-01 -1.5 BRCA2 675 TCGA-BH-NON-EXIST 2 BRCA2 675 TCGA-AO-A129-01 0 -# This gene is new! 
the empty values should be set for the already existing samples in the database CDK1 983 TCGA-C8-A12K-01 -2 Putative_Driver CDK1 983 TCGA-BH-NON-EXIST -2 CDK1 983 TCGA-AO-A129-01 2 Putative_Passenger Test passenger diff --git a/tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt similarity index 89% rename from tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt rename to tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt index 0a353729..d1ce3813 100644 --- a/tests/test_data/study_es_0_inc/meta_cna_discrete_long.txt +++ b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt @@ -1,9 +1,8 @@ cancer_study_identifier: study_es_0 genetic_alteration_type: COPY_NUMBER_ALTERATION -datatype: DISCRETE +datatype: DISCRETE_LONG stable_id: gistic show_profile_in_analysis_tab: true profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. profile_name: Putative copy-number alterations from GISTIC data_filename: data_cna_discrete_long.txt -namespaces: CustomNamespace From 4070e683f88aa155ab2c3f703e168461db0f7a3c Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 24 May 2024 15:16:24 +0200 Subject: [PATCH 083/130] Implement incremental upload for gene panel matrix The uploader was working in an incremental manner already. I had to add tests for those only. I had to implement incremental upload for gene panel matrix from different data (CNA, Mutations) uploaders though. 
--- scripts/importer/cbioportal_common.py | 1 + .../cbio/portal/dao/DaoSampleProfile.java | 63 ++++++++----- .../scripts/ImportCnaDiscreteLongData.java | 25 +++-- .../scripts/ImportExtendedMutationData.java | 72 ++++++++++---- .../scripts/ImportGenePanelProfileMap.java | 4 + .../portal/scripts/ImportTabDelimData.java | 26 ++++-- .../cbio/portal/util/GeneticProfileUtil.java | 2 +- ...IncrementalCopyNumberAlterationImport.java | 35 ++++++- .../TestIncrementalGenePanelMatrixImport.java | 93 +++++++++++++++++++ .../TestIncrementalGenericAssayImporter.java | 12 +++ .../TestIncrementalMutationsImport.java | 8 ++ .../meta_cna_discrete.txt | 1 + .../meta_cna_discrete_long.txt | 1 + .../data_gene_panel_matrix.txt | 2 + .../meta_gene_panel_matrix.txt | 4 + .../generic_assay/meta_treatment_ic50.txt | 1 + .../insert_mutation_data/meta_mutations.txt | 3 +- .../update_mutation_data/meta_mutations.txt | 3 +- src/test/resources/seed_mini.sql | 3 + tests/system_tests_import_data.py | 3 + .../cna_long/meta_cna_discrete_long.txt | 1 + .../study_es_0_inc/data_gene_panel_matrix.txt | 6 ++ .../study_es_0_inc/meta_gene_panel_matrix.txt | 4 + .../meta_mutations_extended.txt | 1 + .../study_es_0_inc/meta_treatment_ic50.txt | 3 +- 25 files changed, 319 insertions(+), 58 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java create mode 100644 src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt create mode 100644 src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt create mode 100644 tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt create mode 100644 tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index c68f68e3..fbb80b89 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -381,6 +381,7 @@ class 
MetaFileTypes(object): MetaFileTypes.GENERIC_ASSAY_BINARY, MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, MetaFileTypes.TIMELINE, + MetaFileTypes.GENE_PANEL_MATRIX, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java index acfb299d..5e895206 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java @@ -52,7 +52,7 @@ private DaoSampleProfile() {} private static final int NO_SUCH_PROFILE_ID = -1; private static final String TABLE_NAME = "sample_profile"; - public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException { + public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException { if (MySQLbulkLoader.isBulkLoad()) { // Add new record using bulk loader. Order of fields is: @@ -80,27 +80,19 @@ public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, I ResultSet rs = null; try { - if (!sampleExistsInGeneticProfile(sampleId, geneticProfileId)) { - con = JdbcUtil.getDbConnection(DaoSampleProfile.class); - pstmt = con.prepareStatement - ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) " - + "VALUES (?,?,?)"); - pstmt.setInt(1, sampleId); - pstmt.setInt(2, geneticProfileId); - if (panelId != null) { - pstmt.setInt(3, panelId); - } - else { - pstmt.setNull(3, java.sql.Types.INTEGER); - } - return pstmt.executeUpdate(); - } else { - // This should be an error, because the record already exists. 
- return 0; + con = JdbcUtil.getDbConnection(DaoSampleProfile.class); + pstmt = con.prepareStatement + ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) VALUES (?,?,?)"); + pstmt.setInt(1, sampleId); + pstmt.setInt(2, geneticProfileId); + if (panelId != null) { + pstmt.setInt(3, panelId); } - } catch (NullPointerException e) { - throw new DaoException(e); - } catch (SQLException e) { + else { + pstmt.setNull(3, java.sql.Types.INTEGER); + } + return pstmt.executeUpdate(); + } catch (NullPointerException | SQLException e) { throw new DaoException(e); } finally { JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); @@ -174,6 +166,35 @@ public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProf } } + public static Integer getPanelId(int sampleId, int geneticProfileId) + throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + + try { + con = JdbcUtil.getDbConnection(DaoSampleProfile.class); + pstmt = con.prepareStatement + ("SELECT PANEL_ID FROM sample_profile WHERE SAMPLE_ID = ? 
AND GENETIC_PROFILE_ID = ?"); + pstmt.setInt(1, sampleId); + pstmt.setInt(2, geneticProfileId); + rs = pstmt.executeQuery(); + if (rs.next()) { + int panelId = rs.getInt(1); + if (rs.wasNull()) { + return null; + } + return panelId; + } else { + throw new NoSuchElementException("No sample_profile with SAMPLE_ID=" + sampleId + " and GENETIC_PROFILE_ID=" + geneticProfileId); + } + } catch (NoSuchElementException | SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); + } + } + public static int countSamplesInProfile(int geneticProfileId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 4a44d453..1c6cf96a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -65,7 +65,6 @@ public class ImportCnaDiscreteLongData { private final File cnaFile; private final int geneticProfileId; private GeneticAlterationImporter geneticAlterationGeneImporter; - private String genePanel; private final DaoGeneOptimized daoGene; private CnaUtil cnaUtil; private Set existingCnaEvents = new HashSet<>(); @@ -78,6 +77,7 @@ public class ImportCnaDiscreteLongData { private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); private ArrayList orderedSampleList; + private final Integer genePanelId; public ImportCnaDiscreteLongData( File cnaFile, @@ -97,7 +97,7 @@ public ImportCnaDiscreteLongData( + " has not supported datatype: " + geneticProfile.getDatatype()); } - this.genePanel = genePanel; + this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.daoGene = daoGene; this.updateMode = updateMode; } @@ -206,7 +206,7 @@ public void extractDataToImport( } throw new RuntimeException("Sample with stable id " + sampleIdStr + " is not found in the database."); } - createSampleProfile(sample); + ensureSampleProfileExists(sample); long entrezId = gene.getEntrezGeneId(); int sampleId = sample.getInternalId(); @@ -223,6 +223,18 @@ public void extractDataToImport( } + private void ensureSampleProfileExists(Sample sample) throws DaoException { + if (updateMode) { + upsertSampleProfile(sample); + } else { + createSampleProfileIfNotExists(sample); + } + } + + private void upsertSampleProfile(Sample sample) throws DaoException { + DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); + } + /** * Store all cna events related to a single gene */ @@ -338,15 +350,14 @@ private CanonicalGene getGene( * * @return boolean created or not */ - public boolean createSampleProfile( + public boolean createSampleProfileIfNotExists( Sample sample - ) throws Exception { + ) throws DaoException { boolean inDatabase = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); - Integer genePanelID = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); SampleIdGeneticProfileId toCreate = new SampleIdGeneticProfileId(sample.getInternalId(), geneticProfileId); boolean isQueued = this.sampleIdGeneticProfileIds.contains(toCreate); if (!inDatabase && !isQueued) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); + DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); this.sampleIdGeneticProfileIds.add(toCreate); return true; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 3da90645..79f025a4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -34,20 +34,48 @@ import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.databind.ObjectMapper; -import org.mskcc.cbio.portal.dao.*; -import org.mskcc.cbio.portal.model.*; -import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; -import org.mskcc.cbio.portal.util.*; -import org.mskcc.cbio.maf.*; - import org.apache.commons.lang3.StringUtils; +import org.mskcc.cbio.maf.MafRecord; +import org.mskcc.cbio.maf.MafUtil; +import org.mskcc.cbio.portal.dao.DaoAlleleSpecificCopyNumber; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoMutation; +import org.mskcc.cbio.portal.dao.DaoReferenceGenome; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.AlleleSpecificCopyNumber; +import org.mskcc.cbio.portal.model.CancerStudy; +import 
org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.ExtendedMutationUtil; +import org.mskcc.cbio.portal.util.GeneticProfileUtil; +import org.mskcc.cbio.portal.util.GlobalProperties; +import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.StableIdUtil; import java.io.BufferedReader; import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.*; -import java.util.regex.*; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.TreeSet; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * Import an extended mutation file. @@ -59,7 +87,7 @@ *
* @author Selcuk Onur Sumer */ -public class ImportExtendedMutationData{ +public class ImportExtendedMutationData { private File mutationFile; private int geneticProfileId; @@ -69,12 +97,13 @@ public class ImportExtendedMutationData{ private int samplesSkipped = 0; private Set sampleSet = new HashSet(); private Set geneSet = new HashSet(); - private String genePanel; private Set filteredMutations = new HashSet(); private Set namespaces = new HashSet(); private Pattern SEQUENCE_SAMPLES_REGEX = Pattern.compile("^.*sequenced_samples:(.*)$"); private final String ASCN_NAMESPACE = "ASCN"; + private final Integer genePanelId; + private final boolean overwriteExisting; /** @@ -89,7 +118,7 @@ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, Strin this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; this.swissprotIsAccession = false; - this.genePanel = genePanel; + this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);; this.filteredMutations = filteredMutations; // create default MutationFilter @@ -428,7 +457,7 @@ public void importData() throws IOException, DaoException { mutations.put(mutation,mutation); } if(!sampleSet.contains(sample.getStableId())) { - addSampleProfileRecord(sample); + ensureSampleProfileExists(sample); } // update ascn object with mutation unique key details if (ascn != null){ @@ -600,17 +629,28 @@ private List getSequencedSamples(String sequencedSamplesIDList, GeneticP private void addSampleProfileRecords(List sequencedSamples) throws DaoException { for (Sample sample : sequencedSamples) { - addSampleProfileRecord(sample); + ensureSampleProfileExists(sample); } if( MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } } - private void addSampleProfileRecord(Sample sample) throws DaoException { + private void ensureSampleProfileExists(Sample sample) throws DaoException { + if (overwriteExisting) { + upsertSampleProfile(sample); + } else { + 
createSampleProfileIfNotExists(sample); + } + } + + private void upsertSampleProfile(Sample sample) throws DaoException { + DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); + } + + private void createSampleProfileIfNotExists(Sample sample) throws DaoException { if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); + DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index 483fa7c2..06a52da6 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -85,6 +85,10 @@ public void run() { "gene panel file" ).withRequiredArg().describedAs( "meta_file.txt" ).ofType( String.class ); parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete"); + // supported by the uploader already. 
Added for uniformity, to do not cause error when upstream software uses this flag + parser.accepts("overwrite-existing", + "Enables re-uploading molecular data that already exist for the given profile and sample.") + .withOptionalArg().describedAs("overwrite-existing").ofType(String.class); OptionSet options; try { options = parser.parse( args ); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 81ea75ab..e138bb8a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -92,7 +92,6 @@ public class ImportTabDelimData { private int entriesSkipped = 0; private int nrExtraRecords = 0; private Set arrayIdSet = new HashSet(); - private String genePanel; private String genericEntityProperties; private File pdAnnotationsFile; private Map, Map> pdAnnotations; @@ -104,6 +103,7 @@ public class ImportTabDelimData { private boolean updateMode; private ArrayList orderedSampleList; + private final Integer genePanelId; /** * Constructor. @@ -170,7 +170,7 @@ public ImportTabDelimData( ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; + this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.updateMode = updateMode; this.daoGene = daoGene; this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); @@ -280,7 +280,7 @@ private void doImportData() throws IOException, DaoException { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } } - ensureSampleGeneticProfile(sample); + ensureSampleProfileExists(sample); orderedSampleList.add(sample.getInternalId()); if (pdAnnotationsForStableSampleIds != null) { Set> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet()); @@ -417,13 +417,21 @@ private void doImportData() throws IOException, DaoException { } } - private void ensureSampleGeneticProfile(Sample sample) throws DaoException { + private void ensureSampleProfileExists(Sample sample) throws DaoException { + if (updateMode) { + upsertSampleProfile(sample); + } else { + createSampleProfileIfNotExists(sample); + } + } + + private void upsertSampleProfile(Sample sample) throws DaoException { + DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); + } + + private void createSampleProfileIfNotExists(Sample sample) throws DaoException { if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); - if (updateMode) { - DaoSampleProfile.deleteRecords(List.of(sample.getInternalId()), List.of(geneticProfileId)); - } - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); + DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); } } diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java index 3e27de9f..748ffd54 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileUtil.java @@ -88,7 +88,7 @@ public static boolean outlierExpressionSelected(HashSet geneticProfileId public static int getGenePanelId(String panelId) { GenePanel genePanel = DaoGenePanel.getGenePanelByStableId(panelId); if (genePanel == null) { - throw new NoSuchElementException("No gene panel with id " + genePanel); + throw new NoSuchElementException("Gene panel with id " + panelId + " not found."); } return genePanel.getInternalId(); } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java index f2bc2b1e..0e1d8a68 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java @@ -26,9 +26,12 @@ import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; import 
org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GenePanel; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportProfileData; import org.springframework.test.context.ContextConfiguration; @@ -49,6 +52,8 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; @@ -85,7 +90,7 @@ public class TestIncrementalCopyNumberAlterationImport { final Set afterSampleIds = new HashSet<>(beforeSampleIds); { afterSampleIds.add(newSampleId); } - @Parameterized.Parameters + @Parameterized.Parameters(name = "{0}") public static Collection primeNumbers() { return Arrays.asList(new Object[][] { { "meta_cna_discrete.txt", "data_cna_discrete.txt" }, @@ -108,6 +113,16 @@ public void testDiscreteCNA() throws DaoException { HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + Map beforeSampleIdToPanelId = new HashMap<>(); + for (int sampleId : noChangeSampleIds) { + try { + beforeSampleIdToPanelId.put(sampleId, + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), @@ -188,6 +203,24 @@ public void testDiscreteCNA() throws DaoException { newGeneEntrezId, CNA.AMP ), updatedSampleEntrezGeneIdToCnaAlteration); + + 
Map afterSampleIdToPanelId = new HashMap<>(); + for (int sampleId : noChangeSampleIds) { + try { + afterSampleIdToPanelId.put(sampleId, + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } catch (DaoException e) { + throw new RuntimeException(e); + } + } + assertEquals(beforeSampleIdToPanelId, afterSampleIdToPanelId); + + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLCNADS"); + for (int sampleId : Set.of(updateSampleId, newSampleId)) { + assertEquals("Sample profile has to point to TSTGNPNLCNADS panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(sampleId, discreteCNAProfile.getGeneticProfileId())); + } } private TestContextManager testContextManager; diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java new file mode 100644 index 00000000..e1df66a6 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenePanelMatrixImport.java @@ -0,0 +1,93 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Gene Panel Matrix Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGenePanelMatrixImport { + + /** + * Test incremental upload + */ + @Test + public void testIncrementalUpload() throws DaoException { + File dataFolder = new File("src/test/resources/incremental/gene_panel_matrix/"); + File metaFile = new File(dataFolder, "meta_gene_panel_matrix.txt"); + File dataFile = new File(dataFolder, "data_gene_panel_matrix.txt"); + + ImportGenePanelProfileMap importGenePanelProfileMap = new ImportGenePanelProfileMap(new String[] { + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importGenePanelProfileMap.run(); + + GenePanel mutationGenePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + GeneticProfile mutationsProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mutations"); + GenePanel longGenePanel = DaoGenePanel.getGenePanelByStableId("TESTPANEL_CNA_DISCRETE_LONG_FORMAT"); + GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub"); + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SB-01"); + assertEquals(mutationGenePanel.getInternalId(), + DaoSampleProfile.getPanelId(sample.getInternalId(), mutationsProfile.getGeneticProfileId())); + assertEquals(longGenePanel.getInternalId(), + DaoSampleProfile.getPanelId(sample.getInternalId(), geneticProfile.getGeneticProfileId())); + assertNull(DaoSampleProfile.getPanelId(sample.getInternalId(), ic50Profile.getGeneticProfileId())); + } + +} diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java index e0ef8cf5..da681e5b 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -22,9 +22,12 @@ import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.model.GenePanel; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportProfileData; import org.springframework.test.annotation.Rollback; @@ -36,9 +39,11 @@ import java.io.IOException; import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Set; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertThrows; @@ -121,6 +126,13 @@ public void testGenericAssay() throws DaoException { assertEquals("0.1", afterResult.get(lbw242EntityId).get(newSampleId)); assertEquals(">~8", afterResult.get(lbw242EntityId).get(updateSampleId)); assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + assertFalse("This sample should not get sample_profile", DaoSampleProfile.sampleExistsInGeneticProfile(noChangeSampleId, ic50Profile.getGeneticProfileId())); + GenePanel genePanel = 
DaoGenePanel.getGenePanelByStableId("TSTGNPNLGENASS"); + for (int sampleId : Set.of(updateSampleId, newSampleId)) { + assertEquals("Sample profile has to point to TSTGNPNLGENASS panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(sampleId, ic50Profile.getGeneticProfileId())); + } } /** diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java index e4def0ed..2a63df78 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMutationsImport.java @@ -88,6 +88,10 @@ public void testInsertNewMutationProfileDataForExistingSampleAndProfile() throws assertNotNull(insertedMutations.get(0).getEvent()); assertNotNull(insertedMutations.get(1).getEvent()); assertNotNull(insertedMutations.get(2).getEvent()); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + assertEquals("Sample profile has to point to TSTGNPNLMUTEXT panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(mutationDataSample.getInternalId(), mutationGeneticProfile.getGeneticProfileId())); } /** * Test updating mutation profile data for existing sample. The mutation genetic profile exists. 
@@ -124,6 +128,10 @@ public void testUpdateMutationProfileDataForExistingSampleAndProfile() throws Da Set entrezIds = insertedMutations.stream().map(m -> m.getEntrezGeneId()).collect(Collectors.toSet()); Set expected = Set.of(207L, 208L, 672L); assertEquals(expected, entrezIds); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLMUTEXT"); + assertEquals("Sample profile has to point to TSTGNPNLMUTEXT panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(mutationDataSample.getInternalId(), mutationGeneticProfile.getGeneticProfileId())); } } diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt index 827c31dd..2cdb4613 100644 --- a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt @@ -8,3 +8,4 @@ profile_name: Putative copy-number alterations from GISTIC data_filename: data_cna_discrete.txt pd_annotations_filename: data_cna_pd_annotations.txt namespaces: CustomNamespace +gene_panel: TSTGNPNLCNADS \ No newline at end of file diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt index 4155601b..c3172961 100644 --- a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete_long.txt @@ -7,3 +7,4 @@ profile_description: Putative copy-number from GISTIC 2.0. 
Values: -2 = homozygo profile_name: Putative copy-number alterations from GISTIC data_filename: data_cna_discrete_long.txt namespaces: CustomNamespace +gene_panel: TSTGNPNLCNADS diff --git a/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt b/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt new file mode 100644 index 00000000..ca2bafda --- /dev/null +++ b/src/test/resources/incremental/gene_panel_matrix/data_gene_panel_matrix.txt @@ -0,0 +1,2 @@ +SAMPLE_ID mutations gistic treatment_ic50 +TCGA-A1-A0SB-01 TSTGNPNLMUTEXT TESTPANEL_CNA_DISCRETE_LONG_FORMAT WXS/WGS diff --git a/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt b/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt new file mode 100644 index 00000000..f0a7385c --- /dev/null +++ b/src/test/resources/incremental/gene_panel_matrix/meta_gene_panel_matrix.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENE_PANEL_MATRIX +datatype: GENE_PANEL_MATRIX +data_filename: data_gene_panel_matrix.txt diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt index 6ec6cdc5..477d01fd 100644 --- a/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt @@ -10,3 +10,4 @@ show_profile_in_analysis_tab: true pivot_threshold_value: 0.1 value_sort_order: ASC generic_entity_meta_properties: NAME,DESCRIPTION,URL +gene_panel: TSTGNPNLGENASS diff --git a/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt index 37915344..2282fbab 100644 --- a/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt +++ b/src/test/resources/incremental/insert_mutation_data/meta_mutations.txt @@ -5,4 +5,5 @@ datatype: MAF 
show_profile_in_analysis_tab: true profile_name: Test Mutations profile_description: Mutation data for testing. -data_filename: data_mutations_extended.txt \ No newline at end of file +data_filename: data_mutations_extended.txt +gene_panel: TSTGNPNLMUTEXT diff --git a/src/test/resources/incremental/update_mutation_data/meta_mutations.txt b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt index 37915344..2282fbab 100644 --- a/src/test/resources/incremental/update_mutation_data/meta_mutations.txt +++ b/src/test/resources/incremental/update_mutation_data/meta_mutations.txt @@ -5,4 +5,5 @@ datatype: MAF show_profile_in_analysis_tab: true profile_name: Test Mutations profile_description: Mutation data for testing. -data_filename: data_mutations_extended.txt \ No newline at end of file +data_filename: data_mutations_extended.txt +gene_panel: TSTGNPNLMUTEXT diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 1fa9d4e1..be554472 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -401,6 +401,9 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (2,'TSTGNPNLCNADS','The CNA Discrete test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (3,'TSTGNPNLMUTEXT','The mutation extended test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (4,'TSTGNPNLGENASS','The generic assay test panel'); -- genetic_alteration INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (2,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'0,0,1,2,0,1,1,1,0,1,1,1,0,1,'); diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 
5fd45a69..577c2a85 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -121,6 +121,8 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_timeline.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_timeline.txt', '--noprogress') case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') + gene_panel_matrix_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--overwrite-existing', + '--meta', f'{data_directory}/meta_gene_panel_matrix.txt', '--data', f'{data_directory}/data_gene_panel_matrix.txt', '--noprogress') self.assertCountEqual(run_java.call_args_list, [ call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), @@ -133,6 +135,7 @@ def test_incremental_load(self, run_java, locate_jar): methylation_hm27_call, treatment_ic50_call, timeline_call, + gene_panel_matrix_call, case_list_call, ]) diff --git a/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt index d1ce3813..df6ccdae 100644 --- a/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt +++ b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt @@ -6,3 +6,4 @@ show_profile_in_analysis_tab: true profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
profile_name: Putative copy-number alterations from GISTIC data_filename: data_cna_discrete_long.txt +gene_panel: TSTGNPNLCNADS diff --git a/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt b/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt new file mode 100644 index 00000000..344837b7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_gene_panel_matrix.txt @@ -0,0 +1,6 @@ +SAMPLE_ID mutations gistic +TEST-A2B8-01 TESTPANEL1 NA +TEST_SAMPLE_3 NA TESTPANEL1 +TCGA-BH-NEW-01 TESTPANEL1 TESTPANEL1 +TCGA-A1-A0SK-01 TESTPANEL2 TESTPANEL1 +TCGA-A1-A0SB-01 TESTPANEL2 TESTPANEL1 diff --git a/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt b/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt new file mode 100644 index 00000000..440f19c3 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_gene_panel_matrix.txt @@ -0,0 +1,4 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: GENE_PANEL_MATRIX +datatype: GENE_PANEL_MATRIX +data_filename: data_gene_panel_matrix.txt diff --git a/tests/test_data/study_es_0_inc/meta_mutations_extended.txt b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt index 94df92aa..9215c831 100644 --- a/tests/test_data/study_es_0_inc/meta_mutations_extended.txt +++ b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt @@ -8,3 +8,4 @@ profile_name: Mutations data_filename: data_mutations_extended.maf swissprot_identifier: name namespaces: Zygosity +gene_panel: TSTGNPNLMUTEXT diff --git a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt index 0d3281cd..ced1af7d 100644 --- a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt +++ b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt @@ -9,4 +9,5 @@ data_filename: data_treatment_ic50.txt show_profile_in_analysis_tab: true pivot_threshold_value: 0.1 value_sort_order: ASC -generic_entity_meta_properties: NAME,DESCRIPTION,URL \ No newline at end of file 
+generic_entity_meta_properties: NAME,DESCRIPTION,URL +gene_panel: TSTGNPNLGENASS From e8bbb341bb444fc5d2398ef564297ad7cf3d22e3 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 17:33:17 +0200 Subject: [PATCH 084/130] Make validation of study_es_0_inc data to pass --- scripts/importer/validateData.py | 2 +- .../study_es_0_inc/cna_long/meta_cna_discrete_long.txt | 1 - tests/test_data/study_es_0_inc/meta_mutations_extended.txt | 1 - tests/test_data/study_es_0_inc/meta_treatment_ic50.txt | 1 - 4 files changed, 1 insertion(+), 4 deletions(-) diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index eb5a0392..3747b0bd 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -3386,7 +3386,7 @@ def checkLine(self, data): sample_ids_panel_dict[sample_id] = data[self.mutation_stable_id_index - 1] # Sample ID has been removed from list, so subtract 1 position. if data[self.mutation_stable_id_index - 1] != 'NA': - if sample_id not in mutation_sample_ids: + if mutation_sample_ids is not None and sample_id not in mutation_sample_ids: self.logger.error('Sample ID has mutation gene panel, but is not in the sequenced case list', extra={'line_number': self.line_number, 'cause': sample_id}) diff --git a/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt index df6ccdae..d1ce3813 100644 --- a/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt +++ b/tests/test_data/study_es_0_inc/cna_long/meta_cna_discrete_long.txt @@ -6,4 +6,3 @@ show_profile_in_analysis_tab: true profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
profile_name: Putative copy-number alterations from GISTIC data_filename: data_cna_discrete_long.txt -gene_panel: TSTGNPNLCNADS diff --git a/tests/test_data/study_es_0_inc/meta_mutations_extended.txt b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt index 9215c831..94df92aa 100644 --- a/tests/test_data/study_es_0_inc/meta_mutations_extended.txt +++ b/tests/test_data/study_es_0_inc/meta_mutations_extended.txt @@ -8,4 +8,3 @@ profile_name: Mutations data_filename: data_mutations_extended.maf swissprot_identifier: name namespaces: Zygosity -gene_panel: TSTGNPNLMUTEXT diff --git a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt index ced1af7d..edc5ef22 100644 --- a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt +++ b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt @@ -10,4 +10,3 @@ show_profile_in_analysis_tab: true pivot_threshold_value: 0.1 value_sort_order: ASC generic_entity_meta_properties: NAME,DESCRIPTION,URL -gene_panel: TSTGNPNLGENASS From feed06cfb61d8d0cc054d5738682bc93fba70898 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 24 May 2024 17:25:24 +0200 Subject: [PATCH 085/130] Implement incremental upload of structural variants data I removed DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); as it does not seem to be needed. it does not make any sense to store samples in genetic_profile_samples, if you don't use genetic_alteration table at all. 
--- scripts/importer/cbioportal_common.py | 1 + .../cbio/portal/dao/DaoStructuralVariant.java | 29 ++++ .../cbio/portal/model/StructuralVariant.java | 54 ++++++++ .../portal/scripts/ImportProfileData.java | 3 +- .../scripts/ImportStructuralVariantData.java | 50 ++++--- ...stIncrementalStructuralVariantsImport.java | 127 ++++++++++++++++++ .../TestImportStructuralVariantData.java | 8 +- .../data_structural_variants.txt | 4 + .../meta_structural_variants.txt | 10 ++ src/test/resources/seed_mini.sql | 1 + tests/system_tests_import_data.py | 3 + .../data_structural_variants.txt | 10 ++ .../meta_structural_variants.txt | 10 ++ 13 files changed, 285 insertions(+), 25 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java create mode 100644 src/test/resources/incremental/structural_variants/data_structural_variants.txt create mode 100644 src/test/resources/incremental/structural_variants/meta_structural_variants.txt create mode 100644 tests/test_data/study_es_0_inc/data_structural_variants.txt create mode 100644 tests/test_data/study_es_0_inc/meta_structural_variants.txt diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index fbb80b89..5f5895d1 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -382,6 +382,7 @@ class MetaFileTypes(object): MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, MetaFileTypes.TIMELINE, MetaFileTypes.GENE_PANEL_MATRIX, + MetaFileTypes.STRUCTURAL_VARIANT, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java index a11026a8..8940e06c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java @@ -29,7 +29,9 @@ import java.sql.ResultSet; import java.sql.SQLException; 
import java.util.ArrayList; +import java.util.Collections; import java.util.List; +import java.util.Set; public class DaoStructuralVariant { @@ -151,6 +153,33 @@ public static void addStructuralVariantToBulkLoader(StructuralVariant structural } } + public static void deleteStructuralVariants(int geneticProfileId, Set sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoGene.class); + pstmt = con.prepareStatement("DELETE structural_variant, alteration_driver_annotation" + + " FROM structural_variant" + + " LEFT JOIN alteration_driver_annotation" + + " ON alteration_driver_annotation.GENETIC_PROFILE_ID = structural_variant.GENETIC_PROFILE_ID" + + " AND alteration_driver_annotation.SAMPLE_ID = structural_variant.SAMPLE_ID" + + " WHERE structural_variant.GENETIC_PROFILE_ID=? AND structural_variant.SAMPLE_ID IN (" + + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, geneticProfileId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoGene.class, con, pstmt, rs); + } + } + public static long getLargestInternalId() throws DaoException { Connection con = null; PreparedStatement pstmt = null; diff --git a/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java b/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java index 89c8352d..60cf8399 100644 --- a/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java +++ b/src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java @@ -468,4 +468,58 @@ public String getAnnotationJson() { public void setAnnotationJson(String annotationJson) { this.annotationJson = annotationJson; } + + @Override + public String toString() { + return "StructuralVariant{" + + 
"internalId=" + internalId + + ", geneticProfileId=" + geneticProfileId + + ", structuralVariantId=" + structuralVariantId + + ", sampleIdInternal=" + sampleIdInternal + + ", sampleId='" + sampleId + '\'' + + ", site1EntrezGeneId=" + site1EntrezGeneId + + ", site1HugoSymbol='" + site1HugoSymbol + '\'' + + ", site1EnsemblTranscriptId='" + site1EnsemblTranscriptId + '\'' + + ", site1Chromosome='" + site1Chromosome + '\'' + + ", site1Position=" + site1Position + + ", site1Contig='" + site1Contig + '\'' + + ", site1Region='" + site1Region + '\'' + + ", site1RegionNumber=" + site1RegionNumber + + ", site1Description='" + site1Description + '\'' + + ", site2EntrezGeneId=" + site2EntrezGeneId + + ", site2HugoSymbol='" + site2HugoSymbol + '\'' + + ", site2EnsemblTranscriptId='" + site2EnsemblTranscriptId + '\'' + + ", site2Chromosome='" + site2Chromosome + '\'' + + ", site2Position=" + site2Position + + ", site2Contig='" + site2Contig + '\'' + + ", site2Region='" + site2Region + '\'' + + ", site2RegionNumber=" + site2RegionNumber + + ", site2Description='" + site2Description + '\'' + + ", site2EffectOnFrame='" + site2EffectOnFrame + '\'' + + ", ncbiBuild='" + ncbiBuild + '\'' + + ", dnaSupport='" + dnaSupport + '\'' + + ", rnaSupport='" + rnaSupport + '\'' + + ", normalReadCount=" + normalReadCount + + ", tumorReadCount=" + tumorReadCount + + ", normalVariantCount=" + normalVariantCount + + ", tumorVariantCount=" + tumorVariantCount + + ", normalPairedEndReadCount=" + normalPairedEndReadCount + + ", tumorPairedEndReadCount=" + tumorPairedEndReadCount + + ", normalSplitReadCount=" + normalSplitReadCount + + ", tumorSplitReadCount=" + tumorSplitReadCount + + ", annotation='" + annotation + '\'' + + ", breakpointType='" + breakpointType + '\'' + + ", connectionType='" + connectionType + '\'' + + ", eventInfo='" + eventInfo + '\'' + + ", variantClass='" + variantClass + '\'' + + ", length=" + length + + ", comments='" + comments + '\'' + + ", svStatus='" + svStatus + '\'' + + 
", driverFilter='" + driverFilter + '\'' + + ", driverFilterAnn='" + driverFilterAnn + '\'' + + ", driverTiersFilter='" + driverTiersFilter + '\'' + + ", driverTiersFilterAnn='" + driverTiersFilterAnn + '\'' + + ", annotationJson='" + annotationJson + '\'' + + '}'; + } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index a02e7f4a..a35e8c29 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -108,7 +108,8 @@ public void run() { dataFile, geneticProfile.getGeneticProfileId(), genePanel, - namespaces + namespaces, + overwriteExisting ); importer.importData(); } else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 50ebd329..c72d1f3a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -46,22 +46,25 @@ public class ImportStructuralVariantData { // Initialize variables - private File structuralVariantFile; - private int geneticProfileId; - private String genePanel; - private Set namespaces; - private Set sampleSet = new HashSet<>(); + private final File structuralVariantFile; + private final int geneticProfileId; + private final Integer genePanelId; + private final Set namespaces; + + private final boolean updateMode; public ImportStructuralVariantData( File structuralVariantFile, int geneticProfileId, String genePanel, - Set namespaces + Set namespaces, + boolean updateMode ) throws DaoException { this.structuralVariantFile = structuralVariantFile; this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; + 
this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); this.namespaces = namespaces; + this.updateMode = updateMode; } public void importData() throws IOException, DaoException { @@ -75,7 +78,7 @@ public void importData() throws IOException, DaoException { int recordCount = 0; // Genetic profile is read in first GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - ArrayList orderedSampleList = new ArrayList(); + Set sampleIds = new HashSet<>(); long id = DaoStructuralVariant.getLargestInternalId(); Set uniqueSVs = new HashSet<>(); while ((line = buf.readLine()) != null) { @@ -175,27 +178,34 @@ public void importData() throws IOException, DaoException { // Add structural variant DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); - // Add sample to sample profile list, which is important for gene panels - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId) && !sampleSet.contains(sample.getStableId())) { - if (genePanel != null) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, GeneticProfileUtil.getGenePanelId(genePanel)); - } else { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, null); - } - } - sampleSet.add(sample.getStableId()); - orderedSampleList.add(sample.getInternalId()); + sampleIds.add(sample.getInternalId()); } } } } - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + // TODO the dao methods could receive a set of sample ids (like the deletion does) instead of looping + if (updateMode) { + for (Integer sampleId : sampleIds) { + DaoSampleProfile.updateSampleProfile(sampleId, geneticProfileId, genePanelId); + } + DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds); + } else { + for (Integer sampleId : sampleIds) { + createSampleProfileIfNotExists(sampleId); + } + } buf.close(); 
MySQLbulkLoader.flushAll(); } - private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, DaoGeneOptimized daoGene) { + private void createSampleProfileIfNotExists(int internalSampleId) throws DaoException { + if (!DaoSampleProfile.sampleExistsInGeneticProfile(internalSampleId, geneticProfileId)) { + DaoSampleProfile.addSampleProfile(internalSampleId, geneticProfileId, genePanelId); + } + } + + private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, DaoGeneOptimized daoGene) { CanonicalGene siteCanonicalGene = null; // If the Entrez Gene Id is not "NA" set the canonical gene. diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java new file mode 100644 index 00000000..538f89cd --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java @@ -0,0 +1,127 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGenePanel; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.DaoSampleProfile; +import org.mskcc.cbio.portal.dao.DaoStructuralVariant; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ExtendedMutation; +import org.mskcc.cbio.portal.model.GenePanel; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.model.StructuralVariant; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.ArrayList; +import java.util.List; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Structural Variants Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalStructuralVariantsImport { + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + /** + * Test incremental upload of SV data + */ + @Test + public void testIncrementalUpload() throws DaoException { + GeneticProfile svGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_structural_variants"); + assertNotNull(svGeneticProfile); + String svDataSampleId = "TCGA-A1-A0SE-01"; + /** + * this sample does not have SV data attached + */ + Sample svDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), svDataSampleId); + + StructuralVariant structuralVariant = new StructuralVariant(); + structuralVariant.setSampleIdInternal(svDataSample.getInternalId()); + structuralVariant.setGeneticProfileId(svGeneticProfile.getGeneticProfileId()); + structuralVariant.setAnnotation("TESTANNOT"); + structuralVariant.setDriverFilter("DRVFILTER"); + structuralVariant.setSite1RegionNumber(1); + structuralVariant.setSite2RegionNumber(2); + structuralVariant.setComments("This record has to be overwritten"); + DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); + DaoSampleProfile.addSampleProfile(svDataSample.getInternalId(), svGeneticProfile.getGeneticProfileId(), null); + MySQLbulkLoader.flushAll(); + + File singleTcgaSampleFolder = new File("src/test/resources/incremental/structural_variants/"); + File metaFile = new File(singleTcgaSampleFolder, "meta_structural_variants.txt"); + File dataFile = new File(singleTcgaSampleFolder, "data_structural_variants.txt"); + + ImportProfileData importProfileData = new 
ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importProfileData.run(); + + List structuralVariants = DaoStructuralVariant.getAllStructuralVariants(); + assertEquals(3, structuralVariants.size()); + Set.of("site1_test_desc_1", "site1_test_desc_2", "site1_test_desc_3").forEach(site1Desc -> { + Optional osv = structuralVariants.stream() + .filter(sv -> site1Desc.equals(sv.getSite1Description()) + && sv.getSampleIdInternal() == svDataSample.getInternalId() + && sv.getGeneticProfileId() == svGeneticProfile.getGeneticProfileId()).findFirst(); + assertTrue(osv.isPresent()); + assertNotNull(osv.get().getDriverFilter()); + }); + GenePanel genePanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLSV"); + assertEquals("Sample profile has to point to TSTGNPNLSV panel", + genePanel.getInternalId(), + DaoSampleProfile.getPanelId(svDataSample.getInternalId(), svGeneticProfile.getGeneticProfileId())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java index 76a58f85..2f91a779 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportStructuralVariantData.java @@ -91,7 +91,7 @@ public void testImportStructuralVariantData() throws DaoException, IOException { // Load test structural variants File file = new File("src/test/resources/data_structural_variants.txt"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces, false); importer.importData(); MySQLbulkLoader.flushAll(); @@ -133,7 +133,7 
@@ public void testImportStructuralVariantDataImportsCustomNamespacesFromTwoSamples // Load test structural variants File file = new File("src/test/resources/data_structural_variants.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); @@ -159,7 +159,7 @@ public void testImportStructuralVariantDataIgnoresUnspecifiedNamespaces() throws // Load test structural variants File file = new File("src/test/resources/data_structural_variants_with_unspecified_namespace.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); @@ -182,7 +182,7 @@ public void testImportStructuralVariantDataWithNoNamespaceData() throws DaoExcep // Load test structural variants File file = new File("src/test/resources/data_structural_variants_with_no_namespace_data.txt"); Set namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2"); - ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport); + ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false); importer.importData(); MySQLbulkLoader.flushAll(); diff --git a/src/test/resources/incremental/structural_variants/data_structural_variants.txt b/src/test/resources/incremental/structural_variants/data_structural_variants.txt new 
file mode 100644 index 00000000..7514bce6 --- /dev/null +++ b/src/test/resources/incremental/structural_variants/data_structural_variants.txt @@ -0,0 +1,4 @@ +Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class SV_Length Comments External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2 +TCGA-A1-A0SE-01 NA AKT1 ENST00000242365 15 7 138536968 EXON site1_test_desc_1 NA BRCA1 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SE-01 NA AKT2 ENST00000242365 15 7 138536968 EXON site1_test_desc_2 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SE-01 NA AKT3 ENST00000344348 7 10 51582939 EXON site1_test_desc_3 NA BRCA2 ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation 
SOMATIC NA NA NA diff --git a/src/test/resources/incremental/structural_variants/meta_structural_variants.txt b/src/test/resources/incremental/structural_variants/meta_structural_variants.txt new file mode 100644 index 00000000..0998ac6e --- /dev/null +++ b/src/test/resources/incremental/structural_variants/meta_structural_variants.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: STRUCTURAL_VARIANT +datatype: SV +data_filename: data_structural_variants.txt +stable_id: structural_variants +profile_name: Test Targeted Fusion Assay data +profile_description: Test Targeted Fusion Assay data description +show_profile_in_analysis_tab: true +gene_panel: TSTGNPNLSV +namespaces: StructVarNs,StructVarNs2 diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index be554472..5ffe18da 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -404,6 +404,7 @@ INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_ INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (2,'TSTGNPNLCNADS','The CNA Discrete test panel'); INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (3,'TSTGNPNLMUTEXT','The mutation extended test panel'); INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (4,'TSTGNPNLGENASS','The generic assay test panel'); +INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (5,'TSTGNPNLSV','The structural variance test panel'); -- genetic_alteration INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (2,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'0,0,1,2,0,1,1,1,0,1,1,1,0,1,'); diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 577c2a85..05d0002f 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -117,6 +117,8 @@ def test_incremental_load(self, run_java, 
locate_jar): '--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress') treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress') + sv_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_structural_variants.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_structural_variants.txt', '--noprogress') timeline_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTimelineData', '--overwrite-existing', '--meta', f'{data_directory}/meta_timeline.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_timeline.txt', '--noprogress') case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', @@ -134,6 +136,7 @@ def test_incremental_load(self, run_java, locate_jar): expression_median_call, methylation_hm27_call, treatment_ic50_call, + sv_call, timeline_call, gene_panel_matrix_call, case_list_call, diff --git a/tests/test_data/study_es_0_inc/data_structural_variants.txt b/tests/test_data/study_es_0_inc/data_structural_variants.txt new file mode 100644 index 00000000..bd395c93 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_structural_variants.txt @@ -0,0 +1,10 @@ +Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support 
Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class SV_Length Comments External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2 +TCGA-BH-NEW NA PIEZO1 ENST00000242365 15 7 138536968 EXON PIEZO1-NCOA4.K16B10.COSF509_1 NA NCOA4 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-BH-NEW NA KIAA1549 ENST00000242365 15 7 138536968 EXON KIAA1549-BRAF.K16B10.COSF509_1 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2 +TCGA-A1-A0SB-03 NA NCOA4 ENST00000344348 7 10 51582939 EXON NCOA4-RET.N7R12_1 NA RET ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC +TCGA-BH-NEW NA EML4 ENST00000318522 6 2 42492091 EXON EML4-ALK.E6bA20.AB374362_1 NA ALK ENST00000389048 20 2 29446394 EXON EML4-ALK.E6bA20.AB374362_2 NA GRCh37 no yes NA 1002 NA 700 NA NA NA NA EML4-ALK.E6bA20.AB374362 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB374362 Putative_Driver Test driver Class 2 Class annotation SOMATIC +TCGA-BH-NEW NA TMPRSS2 ENST00000332149 1 21 42880007 EXON TMPRSS2-ERG.T1E2.COSF23.1_1 NA ERG ENST00000442448 2 21 39956869 EXON TMPRSS2-ERG.T1E2.COSF23.1_2 NA GRCh37 no yes 
NA 1003 NA 600 NA NA NA NA TMPRSS2-ERG.T1E2.COSF23.1 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF23 Unknown Test driver Class 1 Class annotation SOMATIC +TCGA-A1-A0SB-01 NA EGFR ENST00000275493 1 7 55087058 EXON EGFR-EGFR.E1E8.DelPositive.1_1 NA EGFR ENST00000275493 8 7 55223522 EXON EGFR-EGFR.E1E8.DelPositive.1_2 NA GRCh37 no yes NA 1004 NA 500 NA NA NA NA EGFR-EGFR.E1E8.DelPositive NA NA NA Fusion NA NA NA NA Putative_Driver Test driver Unknown Class annotation SOMATIC +TCGA-BH-NEW NA ALK ENST00000389048 11 2 29497964 EXON ALK-PTPN3.A11P3_1 NA PTPN3 ENST00000374541 3 9 112219679 EXON ALK-PTPN3.A11P3_2 NA GRCh37 no yes NA 1005 NA 400 NA NA NA NA ALK-PTPN3.A11P3 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC +TCGA-A1-A0SB-01 NA EML4 ENST00000318522 13 2 42522656 EXON EML4-ALK.E13A20.AB462411_1 NA ALK ENST00000389048 20 2 29446335 EXON EML4-ALK.E13A20.AB462411_2 NA GRCh37 no yes NA 1006 NA 300 NA NA NA NA EML4-ALK.E13A20 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB462411 NA NA NA NA SOMATIC +TCGA-A1-A0SB-03 NA TMPRSS2 ENST00000455813 1 21 42870045 EXON TMPRSS2-ETV1.T1bE4_1 NA ETV1 ENST00000405358 4 7 14017105 EXON TMPRSS2-ETV1.T1bE4_2 NA GRCh37 no yes NA 1007 NA 200 NA NA NA NA TMPRSS2-ETV1.T1bE4 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC diff --git a/tests/test_data/study_es_0_inc/meta_structural_variants.txt b/tests/test_data/study_es_0_inc/meta_structural_variants.txt new file mode 100644 index 00000000..b62d3cbd --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_structural_variants.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: STRUCTURAL_VARIANT +datatype: SV +data_filename: data_structural_variants.txt +stable_id: structural_variants +profile_name: Targeted Fusion Assay data (Fake data) +profile_description: Targeted Fusion Assay data +show_profile_in_analysis_tab: true +gene_panel: TESTPANEL1 +namespaces: StructVarNs,StructVarNs2 From bea498716593e9d1b668fe815542805fe39fbdf5 Mon Sep 17 00:00:00 2001 
From: Ruslan Forostianov Date: Sat, 25 May 2024 23:14:52 +0200 Subject: [PATCH 086/130] Implement incremental upload of CNA segmented data --- scripts/importer/cbioportal_common.py | 1 + .../cbio/portal/dao/DaoClinicalData.java | 27 +++- .../cbio/portal/dao/DaoCopyNumberSegment.java | 41 +++++- .../portal/dao/DaoCopyNumberSegmentFile.java | 5 +- .../scripts/ImportCopyNumberSegmentData.java | 34 ++++- ...ncrementalCopyNumberSegmentDataImport.java | 121 ++++++++++++++++++ .../copy_number_alteration/data_cna.seg | 10 ++ .../copy_number_alteration/meta_cna_seg.txt | 6 + tests/system_tests_import_data.py | 3 + .../study_es_0_inc/data_cna_hg19.seg | 10 ++ .../study_es_0_inc/meta_cna_hg19_seg.txt | 6 + 11 files changed, 250 insertions(+), 14 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java create mode 100644 src/test/resources/incremental/copy_number_alteration/data_cna.seg create mode 100644 src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt create mode 100644 tests/test_data/study_es_0_inc/data_cna_hg19.seg create mode 100644 tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 5f5895d1..9fda27d6 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -383,6 +383,7 @@ class MetaFileTypes(object): MetaFileTypes.TIMELINE, MetaFileTypes.GENE_PANEL_MATRIX, MetaFileTypes.STRUCTURAL_VARIANT, + MetaFileTypes.SEG, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index 8a5aaf30..4ac8f1e6 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -370,7 +370,6 @@ public static List getSampleData(int cancerStudyId, 
Collection sampleInternalIds, String attrId) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + try { + con = JdbcUtil.getDbConnection(DaoClinicalData.class); + pstmt = con.prepareStatement("DELETE FROM " + SAMPLE_ATTRIBUTES_TABLE + + " WHERE `ATTR_ID` = ? AND `INTERNAL_ID` IN (" + + String.join(",", Collections.nCopies(sampleInternalIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setString(parameterIndex++, attrId); + for (Integer sampleInternalId : sampleInternalIds) { + pstmt.setInt(parameterIndex++, sampleInternalId); + } + pstmt.executeUpdate(); + } + catch (SQLException e) { + throw new DaoException(e); + } + finally { + JdbcUtil.closeAll(DaoClinicalData.class, con, pstmt, null); } } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java index a0113a44..a71166f7 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java @@ -67,7 +67,7 @@ public static int addCopyNumberSegment(CopyNumberSegment seg) throws DaoExceptio } } - public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) throws DaoException { + public static void createFractionGenomeAlteredClinicalData(int cancerStudyId, Set sampleIds, boolean updateMode) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; @@ -80,8 +80,15 @@ public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) th "AS c2 WHERE c2.`CANCER_STUDY_ID` = c1.`CANCER_STUDY_ID` AND c2.`SAMPLE_ID` = c1.`SAMPLE_ID` AND " + "ABS(c2.`SEGMENT_MEAN`) >= 0.2) / SUM(`END`-`START`)) AS `VALUE` FROM `copy_number_seg` AS c1 , `cancer_study` " + "WHERE c1.`CANCER_STUDY_ID` = cancer_study.`CANCER_STUDY_ID` AND cancer_study.`CANCER_STUDY_ID`=? 
" + - "GROUP BY cancer_study.`CANCER_STUDY_ID` , `SAMPLE_ID` HAVING SUM(`END`-`START`) > 0;"); - pstmt.setInt(1, cancerStudyId); + (sampleIds == null ? "" : ("AND `SAMPLE_ID` IN ("+ String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ") ")) + +"GROUP BY cancer_study.`CANCER_STUDY_ID` , `SAMPLE_ID` HAVING SUM(`END`-`START`) > 0;"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cancerStudyId); + if (sampleIds != null) { + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + } Map fractionGenomeAltereds = new HashMap(); rs = pstmt.executeQuery(); while (rs.next()) { @@ -94,7 +101,10 @@ public static void createFractionGenomeAlteredClinicalData(int cancerStudyId) th false, "20", cancerStudyId); DaoClinicalAttributeMeta.addDatum(attr); } - + + if (updateMode) { + DaoClinicalData.removeSampleAttributesData(fractionGenomeAltereds.keySet(), FRACTION_GENOME_ALTERED_ATTR_ID); + } for (Map.Entry fractionGenomeAltered : fractionGenomeAltereds.entrySet()) { DaoClinicalData.addSampleDatum(fractionGenomeAltered.getKey(), FRACTION_GENOME_ALTERED_ATTR_ID, fractionGenomeAltered.getValue()); } @@ -283,4 +293,27 @@ public static boolean segmentDataExistForSample(int cancerStudyId, int sampleId) JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, rs); } } + + public static void deleteSegmentDataForSamples(int cancerStudyId, Set sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoCopyNumberSegment.class); + pstmt = con.prepareStatement("DELETE FROM `copy_number_seg`" + + " WHERE `CANCER_STUDY_ID`= ?" 
+ + " AND `SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cancerStudyId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, rs); + } + } } diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java index ef0011a4..cf2332f6 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegmentFile.java @@ -65,7 +65,7 @@ public static int addCopyNumberSegmentFile(CopyNumberSegmentFile copySegFile) th } catch (SQLException e) { throw new DaoException(e); } finally { - JdbcUtil.closeAll(DaoCopyNumberSegment.class, con, pstmt, rs); + JdbcUtil.closeAll(DaoCopyNumberSegmentFile.class, con, pstmt, rs); } } @@ -86,6 +86,9 @@ public static CopyNumberSegmentFile getCopyNumberSegmentFile(int cancerStudyId) cnsf.referenceGenomeId = CopyNumberSegmentFile.ReferenceGenomeId.valueOf(rs.getString("REFERENCE_GENOME_ID")); cnsf.description = rs.getString("DESCRIPTION"); cnsf.filename = rs.getString("FILENAME"); + if (rs.next()) { + throw new SQLException("More than one row was returned."); + } return cnsf; } return null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 92343aa3..1fb5c0d0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -55,7 +55,9 @@ import java.io.FileReader; import java.io.IOException; import java.math.BigDecimal; +import java.util.HashSet; import 
java.util.Properties; +import java.util.Set; /** * Import Segment data into database. @@ -64,7 +66,9 @@ public class ImportCopyNumberSegmentData extends ConsoleRunnable { private int entriesSkipped; - + private boolean updateMode; + private Set processedSampleIds; + private void importData(File file, int cancerStudyId) throws IOException, DaoException { MySQLbulkLoader.bulkLoadOn(); FileReader reader = new FileReader(file); @@ -72,6 +76,7 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc try { String line = buf.readLine(); // skip header line long segId = DaoCopyNumberSegment.getLargestId(); + processedSampleIds = new HashSet<>(); while ((line=buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); @@ -81,8 +86,7 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc System.err.println("wrong format: "+line); } - CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByInternalId(cancerStudyId); - String chrom = strs[1].trim(); + String chrom = strs[1].trim(); //validate in same way as GistitReader: ValidationUtils.validateChromosome(chrom); @@ -112,6 +116,10 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc CopyNumberSegment cns = new CopyNumberSegment(cancerStudyId, s.getInternalId(), chrom, start, end, numProbes, segMean); cns.setSegId(++segId); DaoCopyNumberSegment.addCopyNumberSegment(cns); + processedSampleIds.add(s.getInternalId()); + } + if (updateMode) { + DaoCopyNumberSegment.deleteSegmentDataForSamples(cancerStudyId, processedSampleIds); } MySQLbulkLoader.flushAll(); } @@ -127,6 +135,7 @@ public void run() { OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); + updateMode = options.has("overwrite-existing"); Properties properties = new Properties(); properties.load(new 
FileInputStream(descriptorFile)); @@ -135,13 +144,13 @@ public void run() { CancerStudy cancerStudy = getCancerStudy(properties); - if (segmentDataExistsForCancerStudy(cancerStudy)) { + if (!updateMode && segmentDataExistsForCancerStudy(cancerStudy)) { throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); } importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); - DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId()); + DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, updateMode); if( MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -164,7 +173,7 @@ private static boolean segmentDataExistsForCancerStudy(CancerStudy cancerStudy) return (DaoCopyNumberSegment.segmentDataExistForCancerStudy(cancerStudy.getInternalId())); } - private static void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, Properties properties) throws DaoException { + private void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, Properties properties) throws DaoException { CopyNumberSegmentFile copyNumSegFile = new CopyNumberSegmentFile(); copyNumSegFile.cancerStudyId = cancerStudy.getInternalId(); String referenceGenomeId = properties.getProperty("reference_genome_id").trim(); @@ -179,7 +188,18 @@ private static void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, copyNumSegFile.referenceGenomeId = getRefGenId(referenceGenomeId); copyNumSegFile.description = properties.getProperty("description").trim(); copyNumSegFile.filename = properties.getProperty("data_filename").trim(); - DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumSegFile); + CopyNumberSegmentFile storedCopyNumSegFile = DaoCopyNumberSegmentFile.getCopyNumberSegmentFile(cancerStudy.getInternalId()); + if (updateMode && 
storedCopyNumSegFile != null) { + if (storedCopyNumSegFile.referenceGenomeId != copyNumSegFile.referenceGenomeId) { + throw new IllegalStateException("You are trying to upload " + + copyNumSegFile.referenceGenomeId + + " reference genome data into " + + storedCopyNumSegFile.referenceGenomeId + + " reference genome data."); + } + } else { + DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumSegFile); + } } private void importCopyNumberSegmentFileData(CancerStudy cancerStudy, String dataFilename) throws IOException, DaoException { diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java new file mode 100644 index 00000000..db2ee519 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberSegmentDataImport.java @@ -0,0 +1,121 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoClinicalData; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegment; +import org.mskcc.cbio.portal.dao.DaoCopyNumberSegmentFile; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.dao.MySQLbulkLoader; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.ClinicalData; +import org.mskcc.cbio.portal.model.CopyNumberSegment; +import org.mskcc.cbio.portal.model.CopyNumberSegmentFile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.List; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; + +/** + * Tests Incremental Import of CNA segmented data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalCopyNumberSegmentDataImport { + + /** + * Test incremental upload of CNA SEG data + */ + @Test + public void testIncrementalUpload() throws DaoException { + String segSampleId = "TCGA-A1-A0SE-01"; + Sample segDataSample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), segSampleId); + + CopyNumberSegmentFile copyNumberSegmentFile = new CopyNumberSegmentFile(); + copyNumberSegmentFile.cancerStudyId = cancerStudy.getInternalId(); + copyNumberSegmentFile.referenceGenomeId = CopyNumberSegmentFile.ReferenceGenomeId.hg19; + copyNumberSegmentFile.segFileId = 1; + copyNumberSegmentFile.filename = "test_file.seg"; + copyNumberSegmentFile.description = "test seg file description"; + DaoCopyNumberSegmentFile.addCopyNumberSegmentFile(copyNumberSegmentFile); + DaoClinicalData.addSampleDatum(segDataSample.getInternalId(), "FRACTION_GENOME_ALTERED", "TEST"); + MySQLbulkLoader.bulkLoadOn(); + CopyNumberSegment copyNumberSegment = new CopyNumberSegment( + cancerStudy.getInternalId(), + segDataSample.getInternalId(), + "1", + 3218610, + 95674710, + 100, + 0.01); + copyNumberSegment.setSegId(1L); + DaoCopyNumberSegment.addCopyNumberSegment(copyNumberSegment); + MySQLbulkLoader.flushAll(); + + File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); + File metaFile = new File(dataFolder, "meta_cna_seg.txt"); + File dataFile = new File(dataFolder, "data_cna.seg"); + + ImportCopyNumberSegmentData importCnaSegData = new ImportCopyNumberSegmentData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }); + importCnaSegData.run(); + + CopyNumberSegmentFile fetchedCopyNumberSegmentFile = 
DaoCopyNumberSegmentFile.getCopyNumberSegmentFile(cancerStudy.getInternalId()); + assertNotNull(fetchedCopyNumberSegmentFile); + assertEquals("test_file.seg", fetchedCopyNumberSegmentFile.filename); + List cnaSegments = DaoCopyNumberSegment + .getSegmentForASample(segDataSample.getInternalId(), cancerStudy.getInternalId()); + assertEquals(9, cnaSegments.size()); + List clinicalData = DaoClinicalData.getSampleData(cancerStudy.getInternalId(), Set.of(segSampleId)); + ClinicalData fractionGenomeAltered = clinicalData.stream() + .filter(cd -> "FRACTION_GENOME_ALTERED".equals(cd.getAttrId())).findFirst().get(); + assertEquals("0.0000", fractionGenomeAltered.getAttrVal()); + } + + public static final String STUDY_ID = "study_tcga_pub"; + private CancerStudy cancerStudy; + + @Before + public void setUp() throws DaoException { + cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID); + } + +} diff --git a/src/test/resources/incremental/copy_number_alteration/data_cna.seg b/src/test/resources/incremental/copy_number_alteration/data_cna.seg new file mode 100644 index 00000000..fd1be197 --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/data_cna.seg @@ -0,0 +1,10 @@ +ID chrom loc.start loc.end num.mark seg.mean +TCGA-A1-A0SE-01 1 3218610 95674710 53225 0.0055 +TCGA-A1-A0SE-01 1 95676511 95676518 2 -1.6636 +TCGA-A1-A0SE-01 1 95680124 167057183 24886 0.0053 +TCGA-A1-A0SE-01 1 167057495 167059336 3 -1.0999 +TCGA-A1-A0SE-01 1 167059760 181602002 9213 -8e-04 +TCGA-A1-A0SE-01 1 181603120 181609567 6 -1.2009 +TCGA-A1-A0SE-01 1 181610685 201473647 12002 0.0055 +TCGA-A1-A0SE-01 1 201474400 201474544 2 -1.4235 +TCGA-A1-A0SE-01 1 201475220 247813706 29781 -4e-04 diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt new file mode 100644 index 00000000..61d86a9a --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_seg.txt @@ 
-0,0 +1,6 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: SEG +reference_genome_id: hg19 +description: Test somatic CNA data +data_filename: data_cna.seg diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 05d0002f..9c30cdef 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -125,6 +125,8 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') gene_panel_matrix_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportGenePanelProfileMap', '--overwrite-existing', '--meta', f'{data_directory}/meta_gene_panel_matrix.txt', '--data', f'{data_directory}/data_gene_panel_matrix.txt', '--noprogress') + seg_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportCopyNumberSegmentData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_hg19_seg.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_cna_hg19.seg', '--noprogress') self.assertCountEqual(run_java.call_args_list, [ call(*common_part, 'org.mskcc.cbio.portal.util.VersionUtil',), @@ -139,6 +141,7 @@ def test_incremental_load(self, run_java, locate_jar): sv_call, timeline_call, gene_panel_matrix_call, + seg_call, case_list_call, ]) diff --git a/tests/test_data/study_es_0_inc/data_cna_hg19.seg b/tests/test_data/study_es_0_inc/data_cna_hg19.seg new file mode 100644 index 00000000..4c149a9c --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_hg19.seg @@ -0,0 +1,10 @@ +ID chrom loc.start loc.end num.mark seg.mean +TCGA-A2-A04P-01 1 3218610 95674710 53225 0.0055 +TCGA-A2-A04P-01 1 95676511 95676518 2 -1.6636 +TCGA-A2-A04P-01 1 95680124 167057183 24886 0.0053 +TCGA-A1-A0SB-01 1 167057495 167059336 3 -1.0999 +TCGA-A1-A0SB-01 1 167059760 181602002 9213 -8e-04 +TCGA-A1-A0SB-03 1 181603120 181609567 6 -1.2009 +TCGA-A1-A0SB-03 1 181610685 201473647 
12002 0.0055 +TCGA-BH-NEW-01 1 201474400 201474544 2 -1.4235 +TCGA-BH-NEW-01 1 201475220 247813706 29781 -4e-04 diff --git a/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt b/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt new file mode 100644 index 00000000..f17e1657 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_hg19_seg.txt @@ -0,0 +1,6 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: SEG +reference_genome_id: hg19 +description: Somatic CNA data (copy number ratio from tumor samples minus ratio from matched normals) from TCGA. +data_filename: data_cna_hg19.seg From 0cdda9dcd88effa107bc0f441568196d7e49f916 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 19:47:40 +0200 Subject: [PATCH 087/130] Make it explicit that timeline uploader support bulk mode only --- .../mskcc/cbio/portal/scripts/ImportTimelineData.java | 9 +++++++-- .../java/org/mskcc/cbio/portal/util/ConsoleUtil.java | 4 ---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java index c16eba21..6556c591 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java @@ -135,8 +135,13 @@ public void run() { try { String description = "Import 'timeline' data"; - OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, false); - String dataFile = (String) options.valueOf("data"); + OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + if (options.has("loadMode") && !"bulkLoad".equals(options.valueOf("loadMode"))) { + throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + + options.valueOf("loadMode") + + " has been supplied."); + } + String dataFile = (String) 
options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); boolean overwriteExisting = options.has("overwrite-existing"); diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index 7eba9610..f53c7ba2 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -178,10 +178,6 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de "Error: unknown loadMode action: " + actionArg); } } - else { - throw new UsageException(progName, description, parser, - "Error: 'loadMode' argument required."); - } } return options; } From d7e8ff301b526410bd88900e534e9d69ed900200 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 19:57:39 +0200 Subject: [PATCH 088/130] Fix number of columns in SV tsv data file --- tests/test_data/study_es_0_inc/data_structural_variants.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_data/study_es_0_inc/data_structural_variants.txt b/tests/test_data/study_es_0_inc/data_structural_variants.txt index bd395c93..db82553f 100644 --- a/tests/test_data/study_es_0_inc/data_structural_variants.txt +++ b/tests/test_data/study_es_0_inc/data_structural_variants.txt @@ -1,10 +1,10 @@ Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class 
SV_Length Comments External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2 TCGA-BH-NEW NA PIEZO1 ENST00000242365 15 7 138536968 EXON PIEZO1-NCOA4.K16B10.COSF509_1 NA NCOA4 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2 TCGA-BH-NEW NA KIAA1549 ENST00000242365 15 7 138536968 EXON KIAA1549-BRAF.K16B10.COSF509_1 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2 -TCGA-A1-A0SB-03 NA NCOA4 ENST00000344348 7 10 51582939 EXON NCOA4-RET.N7R12_1 NA RET ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC +TCGA-A1-A0SB-03 NA NCOA4 ENST00000344348 7 10 51582939 EXON NCOA4-RET.N7R12_1 NA RET ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC TCGA-BH-NEW NA EML4 ENST00000318522 6 2 42492091 EXON EML4-ALK.E6bA20.AB374362_1 NA ALK ENST00000389048 20 2 29446394 EXON EML4-ALK.E6bA20.AB374362_2 NA GRCh37 no yes NA 1002 NA 700 NA NA NA NA EML4-ALK.E6bA20.AB374362 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB374362 Putative_Driver Test driver Class 2 Class annotation SOMATIC TCGA-BH-NEW NA TMPRSS2 ENST00000332149 1 21 42880007 EXON TMPRSS2-ERG.T1E2.COSF23.1_1 NA ERG ENST00000442448 2 21 
39956869 EXON TMPRSS2-ERG.T1E2.COSF23.1_2 NA GRCh37 no yes NA 1003 NA 600 NA NA NA NA TMPRSS2-ERG.T1E2.COSF23.1 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF23 Unknown Test driver Class 1 Class annotation SOMATIC -TCGA-A1-A0SB-01 NA EGFR ENST00000275493 1 7 55087058 EXON EGFR-EGFR.E1E8.DelPositive.1_1 NA EGFR ENST00000275493 8 7 55223522 EXON EGFR-EGFR.E1E8.DelPositive.1_2 NA GRCh37 no yes NA 1004 NA 500 NA NA NA NA EGFR-EGFR.E1E8.DelPositive NA NA NA Fusion NA NA NA NA Putative_Driver Test driver Unknown Class annotation SOMATIC +TCGA-A1-A0SB-01 NA EGFR ENST00000275493 1 7 55087058 EXON EGFR-EGFR.E1E8.DelPositive.1_1 NA EGFR ENST00000275493 8 7 55223522 EXON EGFR-EGFR.E1E8.DelPositive.1_2 NA GRCh37 no yes NA 1004 NA 500 NA NA NA NA EGFR-EGFR.E1E8.DelPositive NA NA NA Fusion NA NA NA NA Putative_Driver Test driver Unknown Class annotation SOMATIC TCGA-BH-NEW NA ALK ENST00000389048 11 2 29497964 EXON ALK-PTPN3.A11P3_1 NA PTPN3 ENST00000374541 3 9 112219679 EXON ALK-PTPN3.A11P3_2 NA GRCh37 no yes NA 1005 NA 400 NA NA NA NA ALK-PTPN3.A11P3 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC TCGA-A1-A0SB-01 NA EML4 ENST00000318522 13 2 42522656 EXON EML4-ALK.E13A20.AB462411_1 NA ALK ENST00000389048 20 2 29446335 EXON EML4-ALK.E13A20.AB462411_2 NA GRCh37 no yes NA 1006 NA 300 NA NA NA NA EML4-ALK.E13A20 NA NA NA Fusion NA NA Gain-of-Function GENBANK:AB462411 NA NA NA NA SOMATIC TCGA-A1-A0SB-03 NA TMPRSS2 ENST00000455813 1 21 42870045 EXON TMPRSS2-ETV1.T1bE4_1 NA ETV1 ENST00000405358 4 7 14017105 EXON TMPRSS2-ETV1.T1bE4_2 NA GRCh37 no yes NA 1007 NA 200 NA NA NA NA TMPRSS2-ETV1.T1bE4 NA NA NA Fusion NA NA NA NA NA NA NA NA SOMATIC From ec849e22d32dd2d1a976449279e092e2d6fe123d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 10:04:37 +0200 Subject: [PATCH 089/130] Update paragraph on inc. 
upload in README --- README.md | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2acafd19..f16c9b82 100644 --- a/README.md +++ b/README.md @@ -57,10 +57,9 @@ To execute an incremental upload, use the -d (or --data_directory) option instea docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o ``` **Note:** -While the directory should adhere to the standard cBioPortal file formats and study structure, please note the following specific guidelines for incremental uploads: - -- Incremental uploads are not supported for all data types. For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. -- The data pertaining to patient or sample IDs should only include entries that are either new or need updates. +The directory should adhere to the standard cBioPortal file formats and study structure. +Incremental uploads are not supported for all data types, though. +For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources. 
From deb65cb7550a3a83f55e78022276b0f6d4552f32 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 10:17:56 +0200 Subject: [PATCH 090/130] Rename validation method to better describe it's purpose To really validate entrez id, we need to look it up --- .../java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java | 4 ++-- .../java/org/mskcc/cbio/portal/scripts/ImportGeneData.java | 4 ++-- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 ++-- .../java/org/mskcc/cbio/portal/util/DataValidator.java | 7 +++++++ .../java/org/mskcc/cbio/portal/util/EntrezValidator.java | 7 ------- 5 files changed, 13 insertions(+), 13 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/DataValidator.java delete mode 100644 src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 3adbfb53..9b7478ad 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -46,7 +46,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.EntrezValidator; +import org.mskcc.cbio.portal.util.DataValidator; import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -323,7 +323,7 @@ public List guessGene(String geneId, String chr) { } CanonicalGene gene; - if (EntrezValidator.isaValidEntrezId(geneId)) { // likely to be a entrez gene id + if (DataValidator.isValidNumericSequence(geneId)) { // likely to be a entrez gene id gene = getGene(Integer.parseInt(geneId)); if (gene!=null) { return Collections.singletonList(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 80a062c6..63032aee 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -45,7 +45,7 @@ import org.mskcc.cbio.portal.model.ReferenceGenome; import org.mskcc.cbio.portal.model.ReferenceGenomeGene; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.EntrezValidator; +import org.mskcc.cbio.portal.util.DataValidator; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GlobalProperties; import org.mskcc.cbio.portal.util.ProgressMonitor; @@ -205,7 +205,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); String parts[] = line.split("\t", -1); // include trailing empty strings - if (!EntrezValidator.isaValidEntrezId(parts[0])) { + if (!DataValidator.isValidNumericSequence(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index e138bb8a..93166271 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -50,7 +50,7 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.util.CnaUtil; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.EntrezValidator; +import org.mskcc.cbio.portal.util.DataValidator; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GeneticProfileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; @@ -369,7 +369,7 @@ private void doImportData() throws IOException, DaoException { if (entrez != null && entrez.isEmpty()) { entrez = null; } - if (entrez != null && !EntrezValidator.isaValidEntrezId(entrez)) { + if (entrez != null && 
!DataValidator.isValidNumericSequence(entrez)) { ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); } else { String firstCellValue = rowParts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java b/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java new file mode 100644 index 00000000..1878f063 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/DataValidator.java @@ -0,0 +1,7 @@ +package org.mskcc.cbio.portal.util; + +public class DataValidator { + public static boolean isValidNumericSequence(String str) { + return str.matches("[0-9]+"); + } +} diff --git a/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java b/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java deleted file mode 100644 index 335bfd66..00000000 --- a/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java +++ /dev/null @@ -1,7 +0,0 @@ -package org.mskcc.cbio.portal.util; - -public class EntrezValidator { - public static boolean isaValidEntrezId(String entrez) { - return entrez.matches("[0-9]+"); - } -} From 8692ead4e201d47f045987957cca60536301716e Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 10:24:03 +0200 Subject: [PATCH 091/130] Fix cleaning alteration_driver_annotation table for specific sample --- src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index f19bf514..acb29e16 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -130,6 +130,7 @@ public static void removeSampleCnaEvents(int cnaProfileId, List sampleI ("DELETE sample_cna_event, alteration_driver_annotation" + " FROM sample_cna_event" + " LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + 
+ " AND alteration_driver_annotation.`SAMPLE_ID` = sample_cna_event.`SAMPLE_ID`" + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? AND sample_cna_event.`SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ")"); From be9082cfd21429b25bc58331860c4af5c35d6b08 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 10:57:53 +0200 Subject: [PATCH 092/130] DRY tab separated value string parsing --- .../java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java | 3 ++- .../mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java | 2 +- .../cbio/portal/scripts/ImportCnaDiscreteLongData.java | 4 ++-- .../org/mskcc/cbio/portal/scripts/ImportCosmicData.java | 2 +- .../cbio/portal/scripts/ImportExtendedMutationData.java | 3 ++- .../java/org/mskcc/cbio/portal/scripts/ImportGeneData.java | 2 +- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 2 +- .../cbio/portal/scripts/ImportStructuralVariantData.java | 2 +- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 6 +++--- .../mskcc/cbio/portal/scripts/ImportTypesOfCancers.java | 3 ++- src/main/java/org/mskcc/cbio/portal/util/FileUtil.java | 7 +++++++ .../org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java | 2 +- 12 files changed, 24 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 9b7478ad..b5f21abd 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -47,6 +47,7 @@ import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.util.DataValidator; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -91,7 +92,7 @@ private synchronized void fillCache() { if (line.startsWith("#")) { continue; } - String[] parts = line.trim().split("\t",-1); + String[] parts = 
FileUtil.splitTsvLine(line); CanonicalGene gene = getGene(Long.parseLong(parts[1])); if (gene==null) { ProgressMonitor.logWarning(line+" in config file [resources" + GENE_SYMBOL_DISAMBIGUATION_FILE + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java index 470a06e9..997c74c9 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java @@ -68,7 +68,7 @@ public void convert() throws IOException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = line.split("\t",-1); + String parts[] = FileUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 1c6cf96a..66865316 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -130,7 +130,7 @@ private void doImportData() throws Exception { // Pass first line with headers to util: String line = buf.readLine(); int lineIndex = 1; - String[] headerParts = line.split("\t", -1); + String[] headerParts = FileUtil.splitTsvLine(line); this.cnaUtil = new CnaUtil(headerParts, this.namespaces); boolean isDiscretizedCnaProfile = geneticProfile != null @@ -187,7 +187,7 @@ public void extractDataToImport( if (!FileUtil.isInfoLine(line)) { return; } - String[] lineParts = line.split("\t", -1); + String[] lineParts = FileUtil.splitTsvLine(line); CanonicalGene gene = this.getGene(cnaUtil.getEntrezSymbol(lineParts), lineParts, cnaUtil); importContainer.genes.add(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java index 3cb549c8..302ad44d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java @@ -80,7 +80,7 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = line.split("\t",-1); + String parts[] = FileUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 79f025a4..f7035f01 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -57,6 +57,7 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ExtendedMutationUtil; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GeneticProfileUtil; import org.mskcc.cbio.portal.util.GlobalProperties; import org.mskcc.cbio.portal.util.ProgressMonitor; @@ -194,7 +195,7 @@ public void importData() throws IOException, DaoException { if( !line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1 ); // the -1 keeps trailing empty strings; see JavaDoc for String + String[] parts = FileUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); if (!record.getNcbiBuild().equalsIgnoreCase(genomeBuildName)) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 63032aee..832424ae 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -204,7 +204,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - String parts[] = line.split("\t", -1); // include trailing empty strings + String parts[] = FileUtil.splitTsvLine(line); // include trailing empty strings if (!DataValidator.isValidNumericSequence(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index dddcc156..569588fb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -191,7 +191,7 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int patientSta boolean recordIsStored = false; if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); + String[] parts = FileUtil.splitTsvLine(line); if (parts.length > nrColumns) { if (line.split("\t").length > nrColumns) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index c72d1f3a..6b8f606c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -86,7 +86,7 @@ public void importData() throws IOException, DaoException { ConsoleUtil.showProgress(); if( !line.startsWith("#") && line.trim().length() > 0) { recordCount++; - String parts[] = line.split("\t", -1); + String parts[] = FileUtil.splitTsvLine(line); StructuralVariant 
structuralVariant = structuralVariantUtil.parseStructuralVariantRecord(parts); structuralVariant.setInternalId(++id); structuralVariant.setGeneticProfileId(geneticProfileId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 93166271..eba64c4f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -206,7 +206,7 @@ private void doImportData() throws IOException, DaoException { FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); - String headerParts[] = headerLine.split("\t"); + String[] headerParts = FileUtil.splitTsvLine(headerLine); //Whether data regards CNA or RPPA: boolean isDiscretizedCnaProfile = geneticProfile != null @@ -328,7 +328,7 @@ private void doImportData() throws IOException, DaoException { boolean recordAdded = false; if (FileUtil.isInfoLine(line)) { - String[] rowParts = line.split("\t", -1); + String[] rowParts = FileUtil.splitTsvLine(line); if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length @@ -468,7 +468,7 @@ private Map, Map> readPdAnnotations(File String line = reader.readLine(); while (line != null) { - String[] row = line.split("\t", -1); + String[] row = FileUtil.splitTsvLine(line); if (row.length < 6) { throw new RuntimeException("Mis-formatted row: " + String.join(", ", row)); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java index 4fb17193..977b3f9f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java @@ -38,6 +38,7 @@ import 
org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.model.TypeOfCancer; import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -89,7 +90,7 @@ private static List parseCancerTypesFromFile(File file) throws IOE Scanner scanner = new Scanner(file); while (scanner.hasNextLine()) { String nextLine = scanner.nextLine(); - String[] fields = nextLine.split("\t", -1); + String[] fields = FileUtil.splitTsvLine(nextLine); throwExceptionIfColumnCountIsWrong(file, nextLine, fields, EXPECTED_DATAFILE_COLUMN_COUNT); TypeOfCancer typeOfCancer = new TypeOfCancer(); String typeOfCancerId = fields[0].trim(); diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 2e767618..5506931a 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -32,6 +32,8 @@ package org.mskcc.cbio.portal.util; +import org.jetbrains.annotations.NotNull; + import java.io.BufferedReader; import java.io.File; import java.io.FileReader; @@ -75,4 +77,9 @@ public static boolean isInfoLine(String line) { return !line.startsWith("#") && line.trim().length() > 0; } + @NotNull + public static String[] splitTsvLine(String line) { + return line.split("\t", -1); + } + } \ No newline at end of file diff --git a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java index e1c035e1..431c963c 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java @@ -107,7 +107,7 @@ private static void setMyCancerGenomeLinkFromLocal() { while ((line=in.readLine())!=null && line.startsWith("#")) {} for (; line!=null; line=in.readLine()) { - String[] parts = line.trim().split("\t",-1); + String[] 
parts = FileUtil.splitTsvLine(line); if (parts.length<4) { continue; } From 4e8a7c27edf8cdb91ec4e1ae5885102156c5fef1 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 11:10:19 +0200 Subject: [PATCH 093/130] Reuse FileUtil.isInfoLine(String line) throughout the code --- .../java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java | 2 +- src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java | 2 +- .../mskcc/cbio/portal/scripts/ImportExtendedMutationData.java | 2 +- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 2 +- .../mskcc/cbio/portal/scripts/ImportStructuralVariantData.java | 2 +- src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java | 3 ++- src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java | 2 +- 7 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java index df1a1ce3..265e1475 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java @@ -109,7 +109,7 @@ private HashSet getExcludedCases() throws IOException { HashSet excludedCaseSet = new HashSet(); while (line != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { + if (FileUtil.isInfoLine(line)) { String parts[] = line.split("\t"); excludedCaseSet.add(parts[0]); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java index 17e03c2e..6f1fb52c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java @@ -64,7 +64,7 @@ public void importData() throws IOException, DaoException { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (!line.startsWith("#") && line.trim().length() > 0) { + if (FileUtil.isInfoLine(line)) { 
line = line.trim(); String parts[] = line.split("\t"); String geneSymbol = parts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index f7035f01..edac6f41 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -193,7 +193,7 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if( !line.startsWith("#") && line.trim().length() > 0) + if(FileUtil.isInfoLine(line)) { String[] parts = FileUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index 569588fb..b5018fb8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -190,7 +190,7 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int patientSta boolean recordIsStored = false; - if (!line.startsWith("#") && line.trim().length() > 0) { + if (FileUtil.isInfoLine(line)) { String[] parts = FileUtil.splitTsvLine(line); if (parts.length > nrColumns) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 6b8f606c..9c8199eb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -84,7 +84,7 @@ public void importData() throws IOException, DaoException { while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); 
ConsoleUtil.showProgress(); - if( !line.startsWith("#") && line.trim().length() > 0) { + if(FileUtil.isInfoLine(line)) { recordCount++; String parts[] = FileUtil.splitTsvLine(line); StructuralVariant structuralVariant = structuralVariantUtil.parseStructuralVariantRecord(parts); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java index 7a3ed177..28a83f58 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java @@ -39,6 +39,7 @@ import org.mskcc.cbio.portal.model.User; import org.mskcc.cbio.portal.model.UserAuthorities; import org.mskcc.cbio.portal.util.ConsoleUtil; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import java.io.BufferedReader; @@ -77,7 +78,7 @@ public static void main(String[] args) throws Exception { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (!line.startsWith("#") && line.trim().length() > 0) { + if (FileUtil.isInfoLine(line)) { try { addUser(line); count++; diff --git a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java index b7ef75e0..837e1fa1 100644 --- a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java +++ b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java @@ -273,7 +273,7 @@ public void testResolveAscnAnnotationNamespace() throws Exception { List ascnRecords = new ArrayList<>(); while((line=buf.readLine()) != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { + if (FileUtil.isInfoLine(line)) { MafRecord record = mafUtil.parseRecord(line); // every record in test MAF should have ASCN data Assert.assertTrue(record.getNamespacesMap().containsKey(ASCN_NAMESPACE)); From b93e7413746972653687fe397fd45f4d69775bb8 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 
14:24:27 +0200 Subject: [PATCH 094/130] Extract ensuring header and row match to tsv utility class --- .../cbio/portal/dao/DaoGeneOptimized.java | 4 +- .../portal/scripts/ConvertCosmicVcfToMaf.java | 3 +- .../cbio/portal/scripts/CutInvalidCases.java | 3 +- .../GeneticAlterationIncrementalImporter.java | 3 - .../scripts/ImportCnaDiscreteLongData.java | 8 +- .../cbio/portal/scripts/ImportCosmicData.java | 3 +- .../cbio/portal/scripts/ImportDrugs.java | 3 +- .../scripts/ImportExtendedMutationData.java | 6 +- .../cbio/portal/scripts/ImportGeneData.java | 3 +- .../scripts/ImportGenericAssayEntity.java | 6 +- .../ImportGenericAssayPatientLevelData.java | 4 +- .../scripts/ImportStructuralVariantData.java | 5 +- .../portal/scripts/ImportTabDelimData.java | 98 +++++++++---------- .../portal/scripts/ImportTypesOfCancers.java | 4 +- .../cbio/portal/scripts/ImportUsers.java | 4 +- .../org/mskcc/cbio/portal/util/FileUtil.java | 19 +--- .../portal/util/MyCancerGenomeLinkUtil.java | 15 +-- .../org/mskcc/cbio/portal/util/TsvUtil.java | 43 ++++++++ .../cbioportal/model/util/TsvUtilTest.java | 29 ++++++ .../GeneticAlterationsTestHelper.java | 2 - .../mskcc/cbio/portal/util/TestMafUtil.java | 9 +- src/test/resources/data_CNA_sample.txt | 4 +- .../data_expression_Zscores.txt | 7 +- 23 files changed, 155 insertions(+), 130 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java create mode 100644 src/test/java/org/cbioportal/model/util/TsvUtilTest.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index b5f21abd..d625e724 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -47,8 +47,8 @@ import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.util.DataValidator; -import org.mskcc.cbio.portal.util.FileUtil; import 
org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; /** * A Utility Class that speeds access to Gene Info. @@ -92,7 +92,7 @@ private synchronized void fillCache() { if (line.startsWith("#")) { continue; } - String[] parts = FileUtil.splitTsvLine(line); + String[] parts = TsvUtil.splitTsvLine(line); CanonicalGene gene = getGene(Long.parseLong(parts[1])); if (gene==null) { ProgressMonitor.logWarning(line+" in config file [resources" + GENE_SYMBOL_DISAMBIGUATION_FILE + diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java index 997c74c9..a36fffa4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ConvertCosmicVcfToMaf.java @@ -35,6 +35,7 @@ import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.BufferedWriter; @@ -68,7 +69,7 @@ public void convert() throws IOException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = FileUtil.splitTsvLine(line); + String parts[] = TsvUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java index 265e1475..686fccc8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java @@ -36,6 +36,7 @@ import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.dao.*; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.File; import 
java.io.IOException; @@ -109,7 +110,7 @@ private HashSet getExcludedCases() throws IOException { HashSet excludedCaseSet = new HashSet(); while (line != null) { - if (FileUtil.isInfoLine(line)) { + if (TsvUtil.isInfoLine(line)) { String parts[] = line.split("\t"); excludedCaseSet.add(parts[0]); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index 653dacb1..cdca8413 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -1,6 +1,5 @@ package org.mskcc.cbio.portal.scripts; -import org.jetbrains.annotations.NotNull; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; @@ -70,7 +69,6 @@ public void finalise() { geneticAlterationImporter.finalise(); } - @NotNull private String[] extendValues(int geneticEntityId, String[] values) { Map sampleIdToValue = mapWithFileOrderedSampleList(values); String[] updatedSampleValues = new String[extendedOrderedSampleList.size()]; @@ -91,7 +89,6 @@ private String[] extendValues(int geneticEntityId, String[] values) { return updatedSampleValues; } - @NotNull private Map mapWithFileOrderedSampleList(String[] values) { return ArrayUtil.zip(fileOrderedSampleList.toArray(new Integer[0]), values); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 66865316..c27b67a2 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -40,10 +40,10 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.util.CnaUtil; 
import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GeneticProfileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -130,7 +130,7 @@ private void doImportData() throws Exception { // Pass first line with headers to util: String line = buf.readLine(); int lineIndex = 1; - String[] headerParts = FileUtil.splitTsvLine(line); + String[] headerParts = TsvUtil.splitTsvLine(line); this.cnaUtil = new CnaUtil(headerParts, this.namespaces); boolean isDiscretizedCnaProfile = geneticProfile != null @@ -184,10 +184,10 @@ public void extractDataToImport( int lineIndex, CnaImportData importContainer ) throws Exception { - if (!FileUtil.isInfoLine(line)) { + if (!TsvUtil.isInfoLine(line)) { return; } - String[] lineParts = FileUtil.splitTsvLine(line); + String[] lineParts = TsvUtil.splitTsvLine(line); CanonicalGene gene = this.getGene(cnaUtil.getEntrezSymbol(lineParts), lineParts, cnaUtil); importContainer.genes.add(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java index 302ad44d..25eb5e1e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCosmicData.java @@ -42,6 +42,7 @@ import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.MutationKeywordUtils; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import org.springframework.util.Assert; import java.io.BufferedReader; @@ -80,7 +81,7 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); if (!line.startsWith("#")) { - String parts[] = FileUtil.splitTsvLine(line); + String parts[] = 
TsvUtil.splitTsvLine(line); if (parts.length<8) { System.err.println("Wrong line in cosmic: "+line); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java index 6f1fb52c..dab65f56 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java @@ -38,6 +38,7 @@ import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -64,7 +65,7 @@ public void importData() throws IOException, DaoException { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (FileUtil.isInfoLine(line)) { + if (TsvUtil.isInfoLine(line)) { line = line.trim(); String parts[] = line.split("\t"); String geneSymbol = parts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index edac6f41..5a92d2e2 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -57,11 +57,11 @@ import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.util.ConsoleUtil; import org.mskcc.cbio.portal.util.ExtendedMutationUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GeneticProfileUtil; import org.mskcc.cbio.portal.util.GlobalProperties; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -193,9 +193,9 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); 
ConsoleUtil.showProgress(); - if(FileUtil.isInfoLine(line)) + if(TsvUtil.isInfoLine(line)) { - String[] parts = FileUtil.splitTsvLine(line); + String[] parts = TsvUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); if (!record.getNcbiBuild().equalsIgnoreCase(genomeBuildName)) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 832424ae..d640bc3c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -49,6 +49,7 @@ import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.GlobalProperties; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -204,7 +205,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - String parts[] = FileUtil.splitTsvLine(line); // include trailing empty strings + String parts[] = TsvUtil.splitTsvLine(line); // include trailing empty strings if (!DataValidator.isValidNumericSequence(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 7da2e983..bc91f60d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -40,11 +40,9 @@ import java.io.File; import java.io.FileReader; import java.util.ArrayList; -import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.stream.Collectors; import 
org.cbioportal.model.EntityType; import org.cbioportal.model.GenericEntityProperty; @@ -52,12 +50,12 @@ import org.mskcc.cbio.portal.dao.DaoGenericAssay; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.model.GeneticAlterationType; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import joptsimple.OptionParser; import joptsimple.OptionSet; import joptsimple.OptionSpec; +import org.mskcc.cbio.portal.util.TsvUtil; /** * Note; Imports genetic entities from generic assay files. Has been written for treatment response data @@ -188,7 +186,7 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = buf.readLine(); while (currentLine != null) { - if (!FileUtil.isInfoLine(currentLine)) { + if (!TsvUtil.isInfoLine(currentLine)) { currentLine = buf.readLine(); continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index b5018fb8..6bb8b211 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -190,8 +190,8 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int patientSta boolean recordIsStored = false; - if (FileUtil.isInfoLine(line)) { - String[] parts = FileUtil.splitTsvLine(line); + if (TsvUtil.isInfoLine(line)) { + String[] parts = TsvUtil.splitTsvLine(line); if (parts.length > nrColumns) { if (line.split("\t").length > nrColumns) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 9c8199eb..64e6de77 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -34,7 +34,6 @@ import java.io.File; import java.io.FileReader; import java.io.IOException; -import java.util.ArrayList; import java.util.HashSet; import java.util.Set; @@ -84,9 +83,9 @@ public void importData() throws IOException, DaoException { while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if(FileUtil.isInfoLine(line)) { + if(TsvUtil.isInfoLine(line)) { recordCount++; - String parts[] = FileUtil.splitTsvLine(line); + String parts[] = TsvUtil.splitTsvLine(line); StructuralVariant structuralVariant = structuralVariantUtil.parseStructuralVariantRecord(parts); structuralVariant.setInternalId(++id); structuralVariant.setGeneticProfileId(geneticProfileId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index eba64c4f..1a8a9df1 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -55,6 +55,7 @@ import org.mskcc.cbio.portal.util.GeneticProfileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.mskcc.cbio.portal.util.StableIdUtil; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -206,7 +207,7 @@ private void doImportData() throws IOException, DaoException { FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); - String[] headerParts = FileUtil.splitTsvLine(headerLine); + String[] headerParts = TsvUtil.splitTsvLine(headerLine); //Whether data regards CNA or RPPA: boolean isDiscretizedCnaProfile = geneticProfile != null @@ -318,7 +319,6 @@ private void doImportData() throws IOException, DaoException { genericAssayStableIdToEntityIdMap = 
GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap(); } - int headerColumns = headerParts.length; String line = buf.readLine(); while (line != null) { @@ -327,62 +327,54 @@ private void doImportData() throws IOException, DaoException { ConsoleUtil.showProgress(); boolean recordAdded = false; - if (FileUtil.isInfoLine(line)) { - String[] rowParts = FileUtil.splitTsvLine(line); + if (TsvUtil.isInfoLine(line)) { + String[] rowParts = TsvUtil.splitTsvLine(line); - if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length - + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); - } else if (rowParts.length < headerColumns) { - ProgressMonitor.logWarning("Ignoring line with less fields (" + rowParts.length - + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + TsvUtil.ensureHeaderAndRowMatch(headerParts, rowParts); + String[] sampleValues = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length); + + // trim whitespace from values + sampleValues = Stream.of(sampleValues).map(String::trim).toArray(String[]::new); + sampleValues = filterOutNormalValues(filteredSampleIndices, sampleValues); + + // either parse line as geneset or gene for importing into 'genetic_alteration' table + if (isGsvaProfile) { + String genesetId = rowParts[genesetIdIndex]; + recordAdded = saveGenesetLine(sampleValues, genesetId); + } else if (isGenericAssayProfile) { + String genericAssayId = rowParts[genericAssayIdIndex]; + recordAdded = saveGenericAssayLine(sampleValues, genericAssayId, genericAssayStableIdToEntityIdMap); } else { - String sampleValues[] = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length > headerColumns ? 
headerColumns : rowParts.length); - - // trim whitespace from values - sampleValues = Stream.of(sampleValues).map(String::trim).toArray(String[]::new); - sampleValues = filterOutNormalValues(filteredSampleIndices, sampleValues); - - // either parse line as geneset or gene for importing into 'genetic_alteration' table - if (isGsvaProfile) { - String genesetId = rowParts[genesetIdIndex]; - recordAdded = saveGenesetLine(sampleValues, genesetId); - } else if (isGenericAssayProfile) { - String genericAssayId = rowParts[genericAssayIdIndex]; - recordAdded = saveGenericAssayLine(sampleValues, genericAssayId, genericAssayStableIdToEntityIdMap); + String geneSymbol = null; + if (hugoSymbolIndex != -1) { + geneSymbol = rowParts[hugoSymbolIndex]; + } + if (rppaGeneRefIndex != -1) { + geneSymbol = rowParts[rppaGeneRefIndex]; + } + if (geneSymbol != null && geneSymbol.isEmpty()) { + geneSymbol = null; + } + //get entrez + String entrez = null; + if (entrezGeneIdIndex != -1) { + entrez = rowParts[entrezGeneIdIndex]; + } + if (entrez != null && entrez.isEmpty()) { + entrez = null; + } + if (entrez != null && !DataValidator.isValidNumericSequence(entrez)) { + ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); } else { - String geneSymbol = null; - if (hugoSymbolIndex != -1) { - geneSymbol = rowParts[hugoSymbolIndex]; - } - if (rppaGeneRefIndex != -1) { - geneSymbol = rowParts[rppaGeneRefIndex]; - } - if (geneSymbol != null && geneSymbol.isEmpty()) { - geneSymbol = null; - } - //get entrez - String entrez = null; - if (entrezGeneIdIndex != -1) { - entrez = rowParts[entrezGeneIdIndex]; - } - if (entrez != null && entrez.isEmpty()) { - entrez = null; - } - if (entrez != null && !DataValidator.isValidNumericSequence(entrez)) { - ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); - } else { - String firstCellValue = rowParts[0]; - if (targetLine == null || firstCellValue.equals(targetLine)) { - recordAdded = saveLine(sampleValues, - 
entrez, geneSymbol, - isRppaProfile, isDiscretizedCnaProfile, - existingCnaEvents); - } + String firstCellValue = rowParts[0]; + if (targetLine == null || firstCellValue.equals(targetLine)) { + recordAdded = saveLine(sampleValues, + entrez, geneSymbol, + isRppaProfile, isDiscretizedCnaProfile, + existingCnaEvents); } } } - } // increment number of records added or entries skipped @@ -468,7 +460,7 @@ private Map, Map> readPdAnnotations(File String line = reader.readLine(); while (line != null) { - String[] row = FileUtil.splitTsvLine(line); + String[] row = TsvUtil.splitTsvLine(line); if (row.length < 6) { throw new RuntimeException("Mis-formatted row: " + String.join(", ", row)); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java index 977b3f9f..10bf9159 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTypesOfCancers.java @@ -38,8 +38,8 @@ import org.mskcc.cbio.portal.dao.DaoTypeOfCancer; import org.mskcc.cbio.portal.model.TypeOfCancer; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; /** * Load all the types of cancer and their names from a file. 
@@ -90,7 +90,7 @@ private static List parseCancerTypesFromFile(File file) throws IOE Scanner scanner = new Scanner(file); while (scanner.hasNextLine()) { String nextLine = scanner.nextLine(); - String[] fields = FileUtil.splitTsvLine(nextLine); + String[] fields = TsvUtil.splitTsvLine(nextLine); throwExceptionIfColumnCountIsWrong(file, nextLine, fields, EXPECTED_DATAFILE_COLUMN_COUNT); TypeOfCancer typeOfCancer = new TypeOfCancer(); String typeOfCancerId = fields[0].trim(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java index 28a83f58..b76ec3cb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java @@ -39,8 +39,8 @@ import org.mskcc.cbio.portal.model.User; import org.mskcc.cbio.portal.model.UserAuthorities; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; +import org.mskcc.cbio.portal.util.TsvUtil; import java.io.BufferedReader; import java.io.File; @@ -78,7 +78,7 @@ public static void main(String[] args) throws Exception { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (FileUtil.isInfoLine(line)) { + if (TsvUtil.isInfoLine(line)) { try { addUser(line); count++; diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 5506931a..75bf743e 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -32,8 +32,6 @@ package org.mskcc.cbio.portal.util; -import org.jetbrains.annotations.NotNull; - import java.io.BufferedReader; import java.io.File; import java.io.FileReader; @@ -58,7 +56,7 @@ public static int getNumLines(File file) throws IOException { try (FileReader reader = new FileReader(file); BufferedReader 
buffered = new BufferedReader(reader)) { String line = buffered.readLine(); while (line != null) { - if (isInfoLine(line)) { + if (TsvUtil.isInfoLine(line)) { numLines++; } line = buffered.readLine(); @@ -67,19 +65,4 @@ public static int getNumLines(File file) throws IOException { } } - /** - * Does line brings any information? - * e.g. blank like and comments do not - * @param line - * @return - */ - public static boolean isInfoLine(String line) { - return !line.startsWith("#") && line.trim().length() > 0; - } - - @NotNull - public static String[] splitTsvLine(String line) { - return line.split("\t", -1); - } - } \ No newline at end of file diff --git a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java index 431c963c..a5244050 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/MyCancerGenomeLinkUtil.java @@ -33,23 +33,12 @@ import java.io.BufferedReader; import java.io.IOException; -import java.io.InputStream; import java.io.InputStreamReader; -import java.net.URL; import java.util.ArrayList; -import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; -import java.util.TreeMap; -import java.util.logging.Level; -import java.util.logging.Logger; -import java.util.regex.Matcher; -import java.util.regex.Pattern; -import javax.net.ssl.HttpsURLConnection; -import org.apache.commons.text.StringEscapeUtils; + import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.model.CanonicalGene; @@ -107,7 +96,7 @@ private static void setMyCancerGenomeLinkFromLocal() { while ((line=in.readLine())!=null && line.startsWith("#")) {} for (; line!=null; line=in.readLine()) { - String[] parts = FileUtil.splitTsvLine(line); + String[] parts = TsvUtil.splitTsvLine(line); if (parts.length<4) { continue; } diff 
--git a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java new file mode 100644 index 00000000..5402d560 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java @@ -0,0 +1,43 @@ +package org.mskcc.cbio.portal.util; + +/** + * Utils to parse and validate TSV lines + * @author Ruslan Forostianov + */ +public class TsvUtil { + /** + * Does line brings any information? + * e.g. blank like and comments do not + * @param line + * @return + */ + public static boolean isInfoLine(String line) { + return !line.startsWith("#") && line.trim().length() > 0; + } + + /** + * Splits tsv line and does not trim empty values at the end. + * @param line + * @return + */ + public static String[] splitTsvLine(String line) { + return line.split("\t", -1); + } + + /** + * Makes sure header and row length match + * @param headerParts + * @param rowParts + */ + public static void ensureHeaderAndRowMatch(String[] headerParts, String[] rowParts) { + int headerColumns = headerParts.length; + if (rowParts.length > headerColumns) { + throw new IllegalArgumentException("Found line with more fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + } + if (rowParts.length < headerColumns) { + throw new IllegalArgumentException("Found line with less fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + } + } +} diff --git a/src/test/java/org/cbioportal/model/util/TsvUtilTest.java b/src/test/java/org/cbioportal/model/util/TsvUtilTest.java new file mode 100644 index 00000000..c49de40b --- /dev/null +++ b/src/test/java/org/cbioportal/model/util/TsvUtilTest.java @@ -0,0 +1,29 @@ +package org.cbioportal.model.util; + +import org.junit.Test; + +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.util.TsvUtil.ensureHeaderAndRowMatch; +import static org.junit.Assert.assertThrows; + +public class 
TsvUtilTest { + + @Test + public void testEnsureHeaderAndRowMatch_headerHasGreaterLength() { + IllegalArgumentException illegalArgumentException = assertThrows(IllegalArgumentException.class, + () -> ensureHeaderAndRowMatch(new String[] {"header1", "header2"}, new String[] {"row1"})); + assertTrue(illegalArgumentException.getMessage().contains("Found line with less fields")); + } + + @Test + public void testEnsureHeaderAndRowMatch_headerHasSmallerLength() { + IllegalArgumentException illegalArgumentException = assertThrows(IllegalArgumentException.class, + () -> ensureHeaderAndRowMatch(new String[] {"header1"}, new String[] {"row1", "row2"})); + assertTrue(illegalArgumentException.getMessage().contains("Found line with more fields")); + } + + @Test + public void testEnsureHeaderAndRowMatch_headerHasSameLength() { + ensureHeaderAndRowMatch(new String[] {"header1", "header2"}, new String[] {"row1", "row2"}); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java index fdf36995..48ca7b4e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java @@ -1,6 +1,5 @@ package org.mskcc.cbio.portal.integrationTest.incremental; -import org.jetbrains.annotations.NotNull; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; @@ -12,7 +11,6 @@ import static org.junit.Assert.assertTrue; public class GeneticAlterationsTestHelper { - @NotNull public static Set geneStableIdsToEntityIds(Set beforeStableIds) { return beforeStableIds.stream().map(stableId -> { try { diff --git a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java index 837e1fa1..3577af14 100644 --- 
a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java +++ b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java @@ -34,20 +34,17 @@ import java.io.BufferedReader; import java.io.FileReader; -import java.io.IOException; + import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent; import java.util.*; -import java.util.regex.Matcher; + import org.junit.Assert; import org.junit.Test; import org.mskcc.cbio.maf.MafRecord; import org.mskcc.cbio.maf.MafUtil; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.AlleleSpecificCopyNumber; -import org.mskcc.cbio.portal.model.GeneticProfile; /** * @@ -273,7 +270,7 @@ public void testResolveAscnAnnotationNamespace() throws Exception { List ascnRecords = new ArrayList<>(); while((line=buf.readLine()) != null) { - if (FileUtil.isInfoLine(line)) { + if (TsvUtil.isInfoLine(line)) { MafRecord record = mafUtil.parseRecord(line); // every record in test MAF should have ASCN data Assert.assertTrue(record.getNamespacesMap().containsKey(ASCN_NAMESPACE)); diff --git a/src/test/resources/data_CNA_sample.txt b/src/test/resources/data_CNA_sample.txt index f225740b..ec8b13fc 100644 --- a/src/test/resources/data_CNA_sample.txt +++ b/src/test/resources/data_CNA_sample.txt @@ -1,3 +1,3 @@ -GeneId Hugo_Symbol TCGA-02-0001-01 TCGA-02-0003-01 TCGA-02-0004-01 TCGA-02-0006-01 +GeneId Hugo_Symbol TCGA-02-0001-01 TCGA-02-0003-01 TCGA-02-0004-01 TCGA-02-0006-01 999999672 TESTBRCA1 -2 0 1 0 -999999675 TESTBRCA2 0 2 0 -1 \ No newline at end of file +999999675 TESTBRCA2 0 2 0 -1 diff --git a/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt index dc189cec..a96eabd7 100644 --- a/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt +++ 
b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt @@ -15,15 +15,10 @@ BRCA1 0.6393 0.1 0.5377 BRAF 673 0.785 0.1 0.0426 # Duplicate lines should be ignored BRAF 673 0.7851 0.1 0.0427 -# Although this row has 2 extra columns, we are ok with that as they contain blank values -BRCA2 675 1.0741 0.1 0.718 +BRCA2 675 1.0741 0.1 0.718 # This gene is new! the empty values should be set for the already existing samples in the database CDK1 983 -0.1735 0.1 -0.6412 # These lines have to be skipped -# One column too much -FGFR3 2261 0.045 0.1 0.675 0.0224575 -# No sample columns -PIEZO1 9780 # invalid entrez id P2RY10 -1 0.741 0.1 0.685 # Multigene sign From 9089e77f277211c0ca1aeaa81ddb284cddeeadfd Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 14:38:35 +0200 Subject: [PATCH 095/130] Simplify delete sql. Rely on cascade delete instead. --- .../java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java index d5045721..714769bd 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java @@ -211,10 +211,7 @@ public static void deleteByPatientId(int patientId) throws DaoException { try { con = JdbcUtil.getDbConnection(DaoClinicalEvent.class); - pstmt = con.prepareStatement("DELETE clinical_event, clinical_event_data" + - " FROM clinical_event" + - " LEFT JOIN clinical_event_data ON clinical_event_data.CLINICAL_EVENT_ID = clinical_event.CLINICAL_EVENT_ID" + - " WHERE clinical_event.PATIENT_ID = ?"); + pstmt = con.prepareStatement("DELETE FROM clinical_event WHERE clinical_event.PATIENT_ID = ?"); pstmt.setInt(1, patientId); pstmt.executeUpdate(); } catch (SQLException e) { From 16f62955e5e4c1e26017d8290fac6c299dd90e19 Mon Sep 17 00:00:00 2001 From: Ruslan 
Forostianov Date: Tue, 11 Jun 2024 14:41:00 +0200 Subject: [PATCH 096/130] Generalise overwrite-existing flag description to make it more accurate --- .../mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java | 2 +- src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index 06a52da6..ada71ec8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -87,7 +87,7 @@ public void run() { // supported by the uploader already. Added for uniformity, to do not cause error when upstream software uses this flag parser.accepts("overwrite-existing", - "Enables re-uploading molecular data that already exist for the given profile and sample.") + "Enables overwriting data if it turns out it already exists in DB.") .withOptionalArg().describedAs("overwrite-existing").ofType(String.class); OptionSet options; try { diff --git a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java index f53c7ba2..0d2b6a23 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java @@ -139,7 +139,7 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } parser.accepts("overwrite-existing", - "Enables re-uploading molecular data that already exist for the given profile and sample.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + "Enables overwriting data if it turns out it already exists in DB.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String 
progName = "importScript"; OptionSet options = null; @@ -250,7 +250,7 @@ public static OptionSet parseStandardDataAndMetaUpdateOptions(String[] args, Str .withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class ); } parser.accepts("overwrite-existing", - "Enables re-uploading molecular data that already exist for the given profile and sample.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); + "Enables overwriting data if it turns out it already exists in DB.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class); String progName = "importScript"; From 79c404129862cf39800091a6046960e9a680f101 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 14:55:04 +0200 Subject: [PATCH 097/130] Rename updateMode to isIncrementalUpdateMode flag --- .../scripts/ImportCnaDiscreteLongData.java | 12 ++++---- .../scripts/ImportCopyNumberSegmentData.java | 12 ++++---- .../scripts/ImportStructuralVariantData.java | 8 +++--- .../portal/scripts/ImportTabDelimData.java | 28 +++++++++---------- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index c27b67a2..781bc017 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -71,7 +71,7 @@ public class ImportCnaDiscreteLongData { private int samplesSkipped = 0; private Set namespaces; - private boolean updateMode; + private boolean isIncrementalUpdateMode; private GeneticProfile geneticProfile; @@ -85,7 +85,7 @@ public ImportCnaDiscreteLongData( String genePanel, DaoGeneOptimized daoGene, Set namespaces, - boolean updateMode + boolean isIncrementalUpdateMode ) { this.namespaces = namespaces; this.cnaFile = cnaFile; @@ -99,7 +99,7 @@ public ImportCnaDiscreteLongData( 
} this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); this.daoGene = daoGene; - this.updateMode = updateMode; + this.isIncrementalUpdateMode = isIncrementalUpdateMode; } public ImportCnaDiscreteLongData( @@ -152,7 +152,7 @@ private void doImportData() throws Exception { } orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); - this.geneticAlterationGeneImporter = updateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); for (Long entrezId : toImport.eventsTable.rowKeySet()) { @@ -224,7 +224,7 @@ public void extractDataToImport( } private void ensureSampleProfileExists(Sample sample) throws DaoException { - if (updateMode) { + if (isIncrementalUpdateMode) { upsertSampleProfile(sample); } else { createSampleProfileIfNotExists(sample); @@ -246,7 +246,7 @@ private void storeCnaEvents(CnaImportData toImport, Long entrezId) throws DaoExc .filter(v -> v.cnaEvent != null) .map(v -> v.cnaEvent) .collect(Collectors.toList()); - if (updateMode) { + if (isIncrementalUpdateMode) { DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); } CnaUtil.storeCnaEvents(existingCnaEvents, events); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 1fb5c0d0..ee05a97f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -66,7 +66,7 @@ public class ImportCopyNumberSegmentData extends ConsoleRunnable { private int entriesSkipped; - private boolean updateMode; + private boolean isIncrementalUpdateMode; private Set 
processedSampleIds; private void importData(File file, int cancerStudyId) throws IOException, DaoException { @@ -118,7 +118,7 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc DaoCopyNumberSegment.addCopyNumberSegment(cns); processedSampleIds.add(s.getInternalId()); } - if (updateMode) { + if (isIncrementalUpdateMode) { DaoCopyNumberSegment.deleteSegmentDataForSamples(cancerStudyId, processedSampleIds); } MySQLbulkLoader.flushAll(); @@ -135,7 +135,7 @@ public void run() { OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); - updateMode = options.has("overwrite-existing"); + isIncrementalUpdateMode = options.has("overwrite-existing"); Properties properties = new Properties(); properties.load(new FileInputStream(descriptorFile)); @@ -144,13 +144,13 @@ public void run() { CancerStudy cancerStudy = getCancerStudy(properties); - if (!updateMode && segmentDataExistsForCancerStudy(cancerStudy)) { + if (!isIncrementalUpdateMode && segmentDataExistsForCancerStudy(cancerStudy)) { throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); } importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); - DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, updateMode); + DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, isIncrementalUpdateMode); if( MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -189,7 +189,7 @@ private void importCopyNumberSegmentFileMetadata(CancerStudy cancerStudy, Proper copyNumSegFile.description = properties.getProperty("description").trim(); copyNumSegFile.filename = 
properties.getProperty("data_filename").trim(); CopyNumberSegmentFile storedCopyNumSegFile = DaoCopyNumberSegmentFile.getCopyNumberSegmentFile(cancerStudy.getInternalId()); - if (updateMode && storedCopyNumSegFile != null) { + if (isIncrementalUpdateMode && storedCopyNumSegFile != null) { if (storedCopyNumSegFile.referenceGenomeId != copyNumSegFile.referenceGenomeId) { throw new IllegalStateException("You are trying to upload " + copyNumSegFile.referenceGenomeId diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 64e6de77..b40a8d09 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -50,20 +50,20 @@ public class ImportStructuralVariantData { private final Integer genePanelId; private final Set namespaces; - private final boolean updateMode; + private final boolean isIncrementalUpdateMode; public ImportStructuralVariantData( File structuralVariantFile, int geneticProfileId, String genePanel, Set namespaces, - boolean updateMode + boolean isIncrementalUpdateMode ) throws DaoException { this.structuralVariantFile = structuralVariantFile; this.geneticProfileId = geneticProfileId; this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.namespaces = namespaces; - this.updateMode = updateMode; + this.isIncrementalUpdateMode = isIncrementalUpdateMode; } public void importData() throws IOException, DaoException { @@ -183,7 +183,7 @@ public void importData() throws IOException, DaoException { } } // TODO the dao methods could receive a set of sample ids (like the deletion does) instead of looping - if (updateMode) { + if (isIncrementalUpdateMode) { for (Integer sampleId : sampleIds) { DaoSampleProfile.updateSampleProfile(sampleId, geneticProfileId, genePanelId); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 1a8a9df1..b41191cd 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -102,7 +102,7 @@ public class ImportTabDelimData { private DaoGeneOptimized daoGene; - private boolean updateMode; + private boolean isIncrementalUpdateMode; private ArrayList orderedSampleList; private final Integer genePanelId; @@ -115,7 +115,7 @@ public class ImportTabDelimData { * @param geneticProfileId GeneticProfile ID. * @param genePanel GenePanel * @param genericEntityProperties Generic Assay Entities. - * @param updateMode if true, update/append data to the existing one + * @param isIncrementalUpdateMode if true, update/append data to the existing one * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? 
*/ @@ -125,10 +125,10 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, - boolean updateMode, + boolean isIncrementalUpdateMode, DaoGeneOptimized daoGene ) { - this(dataFile, targetLine, geneticProfileId, genePanel, updateMode, daoGene); + this(dataFile, targetLine, geneticProfileId, genePanel, isIncrementalUpdateMode, daoGene); this.genericEntityProperties = genericEntityProperties; } @@ -139,7 +139,7 @@ public ImportTabDelimData( * @param targetLine The line we want to import. * If null, all lines are imported. * @param geneticProfileId GeneticProfile ID. - * @param updateMode if true, update/append data to the existing one + * @param isIncrementalUpdateMode if true, update/append data to the existing one * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ @@ -148,17 +148,17 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, - boolean updateMode, + boolean isIncrementalUpdateMode, DaoGeneOptimized daoGene ) { - this(dataFile, geneticProfileId, genePanel, updateMode, daoGene); + this(dataFile, geneticProfileId, genePanel, isIncrementalUpdateMode, daoGene); this.targetLine = targetLine; } /** * Constructor. * - * @param updateMode if true, update/append data to the existing one + * @param isIncrementalUpdateMode if true, update/append data to the existing one * @param dataFile Data File containing Copy Number Alteration, MRNA Expression Data, or protein RPPA data * @param geneticProfileId GeneticProfile ID. */ @@ -166,16 +166,16 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, - boolean updateMode, + boolean isIncrementalUpdateMode, DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); - this.updateMode = updateMode; + this.isIncrementalUpdateMode = isIncrementalUpdateMode; this.daoGene = daoGene; this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - if (this.updateMode + if (this.isIncrementalUpdateMode && geneticProfile != null && this.geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE) { throw new UnsupportedOperationException("Incremental upload of geneset scores is not supported."); @@ -303,7 +303,7 @@ private void doImportData() throws IOException, DaoException { } ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); - this.geneticAlterationImporter = updateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) + this.geneticAlterationImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); //cache for data found in cna_event' table: @@ -410,7 +410,7 @@ private void doImportData() throws IOException, DaoException { } private void ensureSampleProfileExists(Sample sample) throws DaoException { - if (updateMode) { + if (isIncrementalUpdateMode) { upsertSampleProfile(sample); } else { createSampleProfileIfNotExists(sample); @@ -677,7 +677,7 @@ private boolean saveLine(String[] values, recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { - if (updateMode) { + if (isIncrementalUpdateMode) { DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedSampleList); } long entrezGeneId = genes.get(0).getEntrezGeneId(); From 111f58e0f635c95d68bb5e2501c1a830f0749a87 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 15:25:52 +0200 Subject: [PATCH 098/130] Improve 
description of overwrite-existing flag for gene panel profile map --- .../mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index ada71ec8..ee8f78be 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -87,7 +87,7 @@ public void run() { // supported by the uploader already. Added for uniformity, to do not cause error when upstream software uses this flag parser.accepts("overwrite-existing", - "Enables overwriting data if it turns out it already exists in DB.") + "Enables re-uploading gene panel profile map data that already exists.") .withOptionalArg().describedAs("overwrite-existing").ofType(String.class); OptionSet options; try { From c4d5eccf742a80cf7c4afcd087999cd2badae5a8 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 15:58:38 +0200 Subject: [PATCH 099/130] Implement more optimal way to update sample profile --- .../cbio/portal/dao/DaoSampleProfile.java | 32 ++++++++++++++++++- .../scripts/ImportStructuralVariantData.java | 5 +-- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java index 5e895206..9551150e 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java @@ -142,7 +142,37 @@ public static void updateSampleProfile(Integer sampleId, Integer geneticProfileI JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); } } - + + public static void upsertSampleProfile(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { + Connection con = null; + 
PreparedStatement pstmt = null; + try { + con = JdbcUtil.getDbConnection(DaoSampleProfile.class); + + pstmt = con.prepareStatement + ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`)" + + " VALUES" + + String.join(",", Collections.nCopies(sampleIds.size(), " (?,?,?)")) + + " ON DUPLICATE KEY UPDATE `PANEL_ID` = VALUES(`PANEL_ID`);"); + int parameterIndex = 1; + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); + pstmt.setInt(parameterIndex++, geneticProfileId); + if (panelId != null) { + pstmt.setInt(parameterIndex, panelId); + } else { + pstmt.setNull(parameterIndex, java.sql.Types.INTEGER); + } + parameterIndex++; + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, null); + } + } + public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProfileId) throws DaoException { Connection con = null; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index b40a8d09..39deb559 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -182,11 +182,8 @@ public void importData() throws IOException, DaoException { } } } - // TODO the dao methods could receive a set of sample ids (like the deletion does) instead of looping if (isIncrementalUpdateMode) { - for (Integer sampleId : sampleIds) { - DaoSampleProfile.updateSampleProfile(sampleId, geneticProfileId, genePanelId); - } + DaoSampleProfile.upsertSampleProfile(sampleIds, geneticProfileId, genePanelId); DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds); } else { for (Integer sampleId : sampleIds) { From 13eb147cf5c8c607c72e4f21a0b81beecebc107b Mon Sep 17 00:00:00 2001 From: Ruslan 
Forostianov Date: Tue, 11 Jun 2024 17:50:58 +0200 Subject: [PATCH 100/130] Optimize code by always using batch upsert for sample profile --- .../cbio/portal/dao/DaoSampleProfile.java | 129 ++++-------------- .../scripts/ImportCnaDiscreteLongData.java | 57 +------- .../scripts/ImportExtendedMutationData.java | 50 ++----- .../scripts/ImportGenePanelProfileMap.java | 17 ++- .../ImportGenericAssayPatientLevelData.java | 11 +- .../scripts/ImportStructuralVariantData.java | 15 +- .../portal/scripts/ImportTabDelimData.java | 20 +-- .../dao/TestDaoSampleProfile.java | 9 +- ...stIncrementalStructuralVariantsImport.java | 6 +- 9 files changed, 65 insertions(+), 249 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java index 9551150e..0f2d8f28 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java @@ -32,12 +32,22 @@ package org.mskcc.cbio.portal.dao; -import org.mskcc.cbio.portal.model.*; - import org.apache.commons.lang3.StringUtils; - -import java.sql.*; -import java.util.*; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; + +import java.sql.Connection; +import java.sql.PreparedStatement; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.util.ArrayList; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.NoSuchElementException; /** * Data access object for sample_profile table @@ -50,100 +60,19 @@ public final class DaoSampleProfile { private DaoSampleProfile() {} private static final int NO_SUCH_PROFILE_ID = -1; - private static final String TABLE_NAME = "sample_profile"; - - public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) 
throws DaoException { - if (MySQLbulkLoader.isBulkLoad()) { - - // Add new record using bulk loader. Order of fields is: - // 1. sample ID - // 2. genetic Profile ID - // 3. gene panel ID - if (panelId != null) { - MySQLbulkLoader.getMySQLbulkLoader(TABLE_NAME).insertRecord( - Integer.toString(sampleId), - Integer.toString(geneticProfileId), - Integer.toString(panelId)); - } else { - MySQLbulkLoader.getMySQLbulkLoader(TABLE_NAME).insertRecord( - Integer.toString(sampleId), - Integer.toString(geneticProfileId), - null); - } - - return 1; - } - // Add new record without using bulk loader - Connection con = null; - PreparedStatement pstmt = null; - ResultSet rs = null; - - try { - con = JdbcUtil.getDbConnection(DaoSampleProfile.class); - pstmt = con.prepareStatement - ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) VALUES (?,?,?)"); - pstmt.setInt(1, sampleId); - pstmt.setInt(2, geneticProfileId); - if (panelId != null) { - pstmt.setInt(3, panelId); - } - else { - pstmt.setNull(3, java.sql.Types.INTEGER); - } - return pstmt.executeUpdate(); - } catch (NullPointerException | SQLException e) { - throw new DaoException(e); - } finally { - JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); - } + public static void upsertSampleProfiles(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { + upsertSampleProfiles( + sampleIds.stream() + .map(sampleId -> new SampleProfileTuple(geneticProfileId, sampleId, panelId)).toList()); } - public static void updateSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException { - /** - * Update a record in the sample_profile table when adding gene panel field from the sample profile matrix. - * Can not use the bulk loader, because the sample might already be added, which requires an UPDATE of the - * record. 
- */ - Connection con = null; - PreparedStatement pstmt = null; - ResultSet rs = null; + public record SampleProfileTuple(int geneticProfileId, int sampleId, Integer panelId) {} - try { - con = JdbcUtil.getDbConnection(DaoSampleProfile.class); - if (!sampleExistsInGeneticProfile(sampleId, geneticProfileId)) { - - pstmt = con.prepareStatement - ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) VALUES (?,?,?)"); - pstmt.setInt(1, sampleId); - pstmt.setInt(2, geneticProfileId); - if (panelId != null) { - pstmt.setInt(3, panelId); - } else { - pstmt.setNull(3, java.sql.Types.INTEGER); - } - } else { - pstmt = con.prepareStatement - ("UPDATE `sample_profile` SET `PANEL_ID` = ? WHERE (`SAMPLE_ID` = ? AND `GENETIC_PROFILE_ID` = ?)"); - if (panelId != null) { - pstmt.setInt(1, panelId); - } else { - pstmt.setNull(1, java.sql.Types.INTEGER); - } - pstmt.setInt(2, sampleId); - pstmt.setInt(3, geneticProfileId); - } - pstmt.executeUpdate(); - } catch (NullPointerException e) { - throw new DaoException(e); - } catch (SQLException e) { - throw new DaoException(e); - } finally { - JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs); + public static void upsertSampleProfiles(Collection idTuples) throws DaoException { + if (idTuples.isEmpty()) { + return; } - } - - public static void upsertSampleProfile(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; try { @@ -152,14 +81,14 @@ public static void upsertSampleProfile(Collection sampleIds, Integer ge pstmt = con.prepareStatement ("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`)" + " VALUES" + - String.join(",", Collections.nCopies(sampleIds.size(), " (?,?,?)")) + + String.join(",", Collections.nCopies(idTuples.size(), " (?,?,?)")) + " ON DUPLICATE KEY UPDATE `PANEL_ID` = VALUES(`PANEL_ID`);"); int parameterIndex = 1; - for (Integer sampleId : sampleIds) { - 
pstmt.setInt(parameterIndex++, sampleId); - pstmt.setInt(parameterIndex++, geneticProfileId); - if (panelId != null) { - pstmt.setInt(parameterIndex, panelId); + for (SampleProfileTuple idTuple : idTuples) { + pstmt.setInt(parameterIndex++, idTuple.sampleId()); + pstmt.setInt(parameterIndex++, idTuple.geneticProfileId()); + if (idTuple.panelId() != null) { + pstmt.setInt(parameterIndex, idTuple.panelId()); } else { pstmt.setNull(parameterIndex, java.sql.Types.INTEGER); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 781bc017..6bc7f226 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -75,7 +75,6 @@ public class ImportCnaDiscreteLongData { private GeneticProfile geneticProfile; - private final ArrayList sampleIdGeneticProfileIds = new ArrayList<>(); private ArrayList orderedSampleList; private final Integer genePanelId; @@ -154,6 +153,7 @@ private void doImportData() throws Exception { orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); + DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { boolean added = storeGeneticAlterations(toImport, entrezId); @@ -206,7 +206,6 @@ public void extractDataToImport( } throw new RuntimeException("Sample with stable id " + sampleIdStr + " is not found in the database."); } - ensureSampleProfileExists(sample); long entrezId = gene.getEntrezGeneId(); int sampleId = sample.getInternalId(); @@ -223,18 +222,6 @@ public void extractDataToImport( } - private void ensureSampleProfileExists(Sample sample) throws DaoException { - if (isIncrementalUpdateMode) { - upsertSampleProfile(sample); - } else { - createSampleProfileIfNotExists(sample); - } - } - - private void upsertSampleProfile(Sample sample) throws DaoException { - DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - } - /** * Store all cna events related to a single gene */ @@ -345,48 +332,6 @@ private CanonicalGene getGene( return null; } - /** - * Find sample and create sample profile when needed - * - * @return boolean created or not - */ - public boolean createSampleProfileIfNotExists( - Sample sample - ) throws DaoException { - boolean inDatabase = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); - SampleIdGeneticProfileId toCreate = new SampleIdGeneticProfileId(sample.getInternalId(), geneticProfileId); - boolean isQueued = this.sampleIdGeneticProfileIds.contains(toCreate); - if (!inDatabase && !isQueued) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - this.sampleIdGeneticProfileIds.add(toCreate); - return true; - } - return false; - } - - - private static class SampleIdGeneticProfileId { - public int sampleId; - public int 
geneticProfileId; - - public SampleIdGeneticProfileId(int sampleId, int geneticProfileId) { - this.sampleId = sampleId; - this.geneticProfileId = geneticProfileId; - } - - @Override - public boolean equals(Object o) { - if (this == o) - return true; - if (o == null || getClass() != o.getClass()) - return false; - - SampleIdGeneticProfileId that = (SampleIdGeneticProfileId) o; - return sampleId == that.sampleId - && geneticProfileId == that.geneticProfileId; - } - } - /** * Find sample and create sample profile when needed */ diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 5a92d2e2..24891046 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -32,8 +32,6 @@ package org.mskcc.cbio.portal.scripts; -import com.fasterxml.jackson.core.JsonProcessingException; -import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.lang3.StringUtils; import org.mskcc.cbio.maf.MafRecord; import org.mskcc.cbio.maf.MafUtil; @@ -97,6 +95,8 @@ public class ImportExtendedMutationData { private int entriesSkipped = 0; private int samplesSkipped = 0; private Set sampleSet = new HashSet(); + private Set internalSampleIds = new HashSet(); + private Set geneSet = new HashSet(); private Set filteredMutations = new HashSet(); private Set namespaces = new HashSet(); @@ -457,9 +457,6 @@ public void importData() throws IOException, DaoException { } else { mutations.put(mutation,mutation); } - if(!sampleSet.contains(sample.getStableId())) { - ensureSampleProfileExists(sample); - } // update ascn object with mutation unique key details if (ascn != null){ ascn.updateAscnUniqueKeyDetails(mutation); @@ -468,6 +465,7 @@ public void importData() throws IOException, DaoException { //keep track: sampleSet.add(sample.getStableId()); + 
internalSampleIds.add(sample.getInternalId()); geneSet.add(mutation.getEntrezGeneId()+""); } else { @@ -476,6 +474,7 @@ public void importData() throws IOException, DaoException { } } } + DaoSampleProfile.upsertSampleProfiles(internalSampleIds, geneticProfileId, genePanelId); for (MutationEvent event : newEvents) { try { @@ -603,19 +602,21 @@ private String transformOMAScore( String omaScore) { private String processMAFHeader(BufferedReader buffer) throws IOException, DaoException { GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); String line = buffer.readLine().trim(); + Set internalSampleIds = new HashSet<>(); while (line.startsWith("#")) { Matcher seqSamplesMatcher = SEQUENCE_SAMPLES_REGEX.matcher(line); // line is of format #sequenced_samples: STABLE_ID STABLE_ID STABLE_ID STABLE_ID if (seqSamplesMatcher.find()) { - addSampleProfileRecords(getSequencedSamples(seqSamplesMatcher.group(1), geneticProfile)); + internalSampleIds.addAll(getSequencedInternalSampleId(seqSamplesMatcher.group(1), geneticProfile)); } line = buffer.readLine().trim(); } + DaoSampleProfile.upsertSampleProfiles(internalSampleIds, geneticProfileId, genePanelId); return line; } - private List getSequencedSamples(String sequencedSamplesIDList, GeneticProfile geneticProfile) { - ArrayList toReturn = new ArrayList(); + private Set getSequencedInternalSampleId(String sequencedSamplesIDList, GeneticProfile geneticProfile) { + Set toReturn = new HashSet<>(); for (String stableSampleID : sequencedSamplesIDList.trim().split("\\s")) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(stableSampleID)); @@ -623,44 +624,13 @@ private List getSequencedSamples(String sequencedSamplesIDList, GeneticP if (sample == null) { missingSample(stableSampleID); } - toReturn.add(sample); + toReturn.add(sample.getInternalId()); } return toReturn; } - private void addSampleProfileRecords(List sequencedSamples) 
throws DaoException { - for (Sample sample : sequencedSamples) { - ensureSampleProfileExists(sample); - } - if( MySQLbulkLoader.isBulkLoad()) { - MySQLbulkLoader.flushAll(); - } - } - - private void ensureSampleProfileExists(Sample sample) throws DaoException { - if (overwriteExisting) { - upsertSampleProfile(sample); - } else { - createSampleProfileIfNotExists(sample); - } - } - - private void upsertSampleProfile(Sample sample) throws DaoException { - DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - } - - private void createSampleProfileIfNotExists(Sample sample) throws DaoException { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - } - } - private void missingSample(String stableSampleID) { throw new NullPointerException("Sample is not found in database (is it missing from clinical data file?): " + stableSampleID); } - private String convertMapToJsonString(Map> map) throws JsonProcessingException { - ObjectMapper mapper = new ObjectMapper(); - return mapper.writeValueAsString(map); - } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index ee8f78be..d6812d9c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -53,9 +53,11 @@ import java.io.FileInputStream; import java.io.FileReader; import java.util.Arrays; +import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Properties; +import java.util.Set; /** * @@ -173,7 +175,9 @@ public void importData() throws Exception { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), sampleId); row_data.remove((int)sampleIdIndex); - + + 
+ Set sampleProfileTuples = new HashSet<>(); // Loop over the values in the row for (int i = 0; i < row_data.size(); i++) { String genePanelName = row_data.get(i); @@ -186,12 +190,13 @@ public void importData() throws Exception { } Integer genePanelId = determineGenePanelId(genePanelName); - // Add gene panel information to database - DaoSampleProfile.updateSampleProfile( - sample.getInternalId(), - profileIds.get(i), - genePanelId); + Integer geneticProfileId = profileIds.get(i); + int sampleInternalId = sample.getInternalId(); + + sampleProfileTuples.add(new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sampleInternalId, genePanelId)); } + + DaoSampleProfile.upsertSampleProfiles(sampleProfileTuples); } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index 6bb8b211..846dd1ff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -116,18 +116,13 @@ public void importData() throws IOException, DaoException { throw new RuntimeException("Unknown patient id '" + StableIdUtil.getPatientId(patientIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } else { List samples = DaoSample.getSamplesByPatientId(patient.getInternalId()); - List sampleInternalIds = samples.stream().map(sample -> sample.getInternalId()).collect(Collectors.toList()); - for (int j = 0; j < sampleInternalIds.size(); j++) { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sampleInternalIds.get(j), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sampleInternalIds.get(j), geneticProfileId, genePanelID); - } - orderedSampleList.add(sampleInternalIds.get(j)); - } + samples.forEach(sample -> orderedSampleList.add(sample.getInternalId())); numSamplesInPatient[i][0] = samples.size(); sampleCount += samples.size(); } } + Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); + DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelID); ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines-1)); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index 39deb559..c22829ba 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -182,26 +182,17 @@ public void importData() throws IOException, DaoException { } } } + + DaoSampleProfile.upsertSampleProfiles(sampleIds, geneticProfileId, genePanelId); if (isIncrementalUpdateMode) { - DaoSampleProfile.upsertSampleProfile(sampleIds, geneticProfileId, genePanelId); DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds); - } else { - for (Integer sampleId : sampleIds) { - createSampleProfileIfNotExists(sampleId); - } } buf.close(); MySQLbulkLoader.flushAll(); } - private void createSampleProfileIfNotExists(int internalSampleId) throws DaoException { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(internalSampleId, geneticProfileId)) { - DaoSampleProfile.addSampleProfile(internalSampleId, geneticProfileId, genePanelId); - } - } - - private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, DaoGeneOptimized daoGene) { + private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, 
DaoGeneOptimized daoGene) { CanonicalGene siteCanonicalGene = null; // If the Entrez Gene Id is not "NA" set the canonical gene. diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index b41191cd..895f4d6e 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -281,7 +281,6 @@ private void doImportData() throws IOException, DaoException { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } } - ensureSampleProfileExists(sample); orderedSampleList.add(sample.getInternalId()); if (pdAnnotationsForStableSampleIds != null) { Set> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet()); @@ -387,6 +386,7 @@ private void doImportData() throws IOException, DaoException { line = buf.readLine(); } + DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelId); geneticAlterationImporter.finalise(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); @@ -409,24 +409,6 @@ private void doImportData() throws IOException, DaoException { } } - private void ensureSampleProfileExists(Sample sample) throws DaoException { - if (isIncrementalUpdateMode) { - upsertSampleProfile(sample); - } else { - createSampleProfileIfNotExists(sample); - } - } - - private void upsertSampleProfile(Sample sample) throws DaoException { - DaoSampleProfile.updateSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - } - - private void createSampleProfileIfNotExists(Sample sample) throws DaoException { - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId); - } - } - private Map, Map> 
readPdAnnotations(File pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java index 705f46b6..33d4d15d 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java @@ -55,6 +55,7 @@ import java.util.ArrayList; import java.util.HashSet; +import java.util.List; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; @@ -105,8 +106,8 @@ public void testDaoSampleProfile() throws DaoException { Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(study.getInternalId(), "TCGA-12345"); Sample sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-12345-01"); - int num = DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, null); - assertEquals(1, num); + DaoSampleProfile.upsertSampleProfiles(List.of( + new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), null))); boolean exists = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); assertTrue(exists); @@ -114,8 +115,8 @@ public void testDaoSampleProfile() throws DaoException { assertEquals(geneticProfileId, DaoSampleProfile.getProfileIdForSample(sample.getInternalId())); sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-123456-01"); - num = DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanel.getInternalId()); - assertEquals(1, num); + DaoSampleProfile.upsertSampleProfiles(List.of( + new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), genePanel.getInternalId()))); boolean existsByPanelId = 
DaoSampleProfile.sampleProfileMappingExistsByPanel(genePanel.getInternalId()); assertTrue(existsByPanelId); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java index 538f89cd..80f49e67 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java @@ -29,7 +29,6 @@ import org.mskcc.cbio.portal.dao.DaoStructuralVariant; import org.mskcc.cbio.portal.dao.MySQLbulkLoader; import org.mskcc.cbio.portal.model.CancerStudy; -import org.mskcc.cbio.portal.model.ExtendedMutation; import org.mskcc.cbio.portal.model.GenePanel; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Sample; @@ -41,11 +40,9 @@ import org.springframework.transaction.annotation.Transactional; import java.io.File; -import java.util.ArrayList; import java.util.List; import java.util.Optional; import java.util.Set; -import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -93,8 +90,9 @@ public void testIncrementalUpload() throws DaoException { structuralVariant.setSite2RegionNumber(2); structuralVariant.setComments("This record has to be overwritten"); DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); - DaoSampleProfile.addSampleProfile(svDataSample.getInternalId(), svGeneticProfile.getGeneticProfileId(), null); MySQLbulkLoader.flushAll(); + DaoSampleProfile.upsertSampleProfiles(List.of( + new DaoSampleProfile.SampleProfileTuple(svGeneticProfile.getGeneticProfileId(), svDataSample.getInternalId(), null))); File singleTcgaSampleFolder = new File("src/test/resources/incremental/structural_variants/"); File metaFile = new 
File(singleTcgaSampleFolder, "meta_structural_variants.txt"); From 95c32f8eb3abb84c2d6d2deb9932dfb072c31786 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 17:52:15 +0200 Subject: [PATCH 101/130] Recognise that SEG importer always use bulkLoad --- .../cbio/portal/scripts/ImportCopyNumberSegmentData.java | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index ee05a97f..9828cf1d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -133,6 +133,11 @@ public void run() { String description = "Import 'segment data' files"; OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + if (options.has("loadMode") && !"bulkLoad".equals(options.valueOf("loadMode"))) { + throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + + options.valueOf("loadMode") + + " has been supplied."); + } String dataFile = (String) options.valueOf("data"); File descriptorFile = new File((String) options.valueOf("meta")); isIncrementalUpdateMode = options.has("overwrite-existing"); @@ -151,9 +156,6 @@ public void run() { importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, isIncrementalUpdateMode); - if( MySQLbulkLoader.isBulkLoad()) { - MySQLbulkLoader.flushAll(); - } } catch (RuntimeException e) { throw e; } catch (IOException|DaoException e) { From e3ec5d6064c9a7b3b0b19f19e28a674738ce4469 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 18:22:45 +0200 Subject: [PATCH 102/130] Organise bulk mode flushing for 
SEG importer --- .../cbio/portal/scripts/ImportCopyNumberSegmentData.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 9828cf1d..79fe4fa0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -70,7 +70,6 @@ public class ImportCopyNumberSegmentData extends ConsoleRunnable { private Set processedSampleIds; private void importData(File file, int cancerStudyId) throws IOException, DaoException { - MySQLbulkLoader.bulkLoadOn(); FileReader reader = new FileReader(file); BufferedReader buf = new BufferedReader(reader); try { @@ -121,7 +120,6 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc if (isIncrementalUpdateMode) { DaoCopyNumberSegment.deleteSegmentDataForSamples(cancerStudyId, processedSampleIds); } - MySQLbulkLoader.flushAll(); } finally { buf.close(); @@ -152,9 +150,11 @@ public void run() { if (!isIncrementalUpdateMode && segmentDataExistsForCancerStudy(cancerStudy)) { throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); } - + MySQLbulkLoader.bulkLoadOn(); importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); + MySQLbulkLoader.flushAll(); + MySQLbulkLoader.bulkLoadOff(); DaoCopyNumberSegment.createFractionGenomeAlteredClinicalData(cancerStudy.getInternalId(), processedSampleIds, isIncrementalUpdateMode); } catch (RuntimeException e) { throw e; From fc84a410c540be471258697a746b598d507246b8 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 11 Jun 2024 18:26:57 +0200 Subject: [PATCH 103/130] Ignore case for bulkLoad load mode option as everywhere in the code 
--- .../mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java | 2 +- .../java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 79fe4fa0..69efd93d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -131,7 +131,7 @@ public void run() { String description = "Import 'segment data' files"; OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); - if (options.has("loadMode") && !"bulkLoad".equals(options.valueOf("loadMode"))) { + if (options.has("loadMode") && !"bulkLoad".equalsIgnoreCase((String) options.valueOf("loadMode"))) { throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + options.valueOf("loadMode") + " has been supplied."); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java index 6556c591..24903d17 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTimelineData.java @@ -136,7 +136,7 @@ public void run() { String description = "Import 'timeline' data"; OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); - if (options.has("loadMode") && !"bulkLoad".equals(options.valueOf("loadMode"))) { + if (options.has("loadMode") && !"bulkLoad".equalsIgnoreCase((String) options.valueOf("loadMode"))) { throw new UnsupportedOperationException("This loader supports bulkLoad load mode only, but " + options.valueOf("loadMode") + " has been supplied."); From 4eac2595e06f56d56ae7d3c7cffde9e26178f275 Mon Sep 17 00:00:00 2001 From: 
pieterlukasse Date: Thu, 13 Jun 2024 13:34:30 +0200 Subject: [PATCH 104/130] add comma to README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index f16c9b82..f4148fd3 100644 --- a/README.md +++ b/README.md @@ -57,8 +57,7 @@ To execute an incremental upload, use the -d (or --data_directory) option instea docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o ``` **Note:** -While the directory should adhere to the standard cBioPortal file formats and study structure. -Incremental uploads are not supported for all data types though. +While the directory should adhere to the standard cBioPortal file formats and study structure, incremental uploads are not supported for all data types though. For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources. From d0428f83a19fc8004719b57b1a679096351da2b8 Mon Sep 17 00:00:00 2001 From: pieterlukasse Date: Thu, 13 Jun 2024 13:38:19 +0200 Subject: [PATCH 105/130] improve order comments for INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES --- scripts/importer/cbioportal_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index 9fda27d6..e4bbe041 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -365,7 +365,7 @@ class MetaFileTypes(object): }, } -# in order of they should be loaded +# order is important! 
This is the order in which they should be loaded: INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES = [ MetaFileTypes.PATIENT_ATTRIBUTES, MetaFileTypes.SAMPLE_ATTRIBUTES, From bf2d5395fa531c4bed059411b53fc1aa07de93c2 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 13 Jun 2024 17:12:10 +0200 Subject: [PATCH 106/130] Add join by GENETIC_PROFILE_ID column for sample_cna_event and alteration_driver_annotaiton tables --- src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index acb29e16..c155dce7 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -131,6 +131,7 @@ public static void removeSampleCnaEvents(int cnaProfileId, List sampleI " FROM sample_cna_event" + " LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + " AND alteration_driver_annotation.`SAMPLE_ID` = sample_cna_event.`SAMPLE_ID`" + + " AND alteration_driver_annotation.`GENETIC_PROFILE_ID` = sample_cna_event.`GENETIC_PROFILE_ID`" + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? 
AND sample_cna_event.`SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ")"); From 37dcc20b88904e0737dedd14261a6b7e089ddc02 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 14 Jun 2024 15:55:54 +0200 Subject: [PATCH 107/130] Check for inconsistency in sample ids and values while reading genetic alterations --- .../cbio/portal/dao/DaoGeneticAlteration.java | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 0358b132..ce3980d8 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -236,8 +236,20 @@ public HashMap> getGeneticAlterationMapForEntit HashMap mapSampleValue = new HashMap(); int geneticEntityId = rs.getInt("GENETIC_ENTITY_ID"); String values = rs.getString("VALUES"); - //hm.debug.. 
- String valueParts[] = values.split(DELIM, -1); + String[] valueParts = values.split(DELIM, -1); + int valuesLength = valueParts.length; + boolean hasMeaninglessTrailingDelimiter = valuesLength - orderedSampleList.size() == 1 && valueParts[valuesLength - 1].isEmpty(); + if (hasMeaninglessTrailingDelimiter) { + // adjust value length to account for the trailing delimiter + valuesLength -= 1; + } + if (valuesLength != orderedSampleList.size()) { + throw new IllegalStateException( + "Data inconsistency detected: The length of the values for genetic profile with Id = " + + geneticProfileId + " and genetic entity with ID = " + geneticEntityId + + " (" + valuesLength + " elements) does not match the expected length of the sample list (" + + orderedSampleList.size() + " elements)."); + } for (int i = 0; i < orderedSampleList.size(); i++) { String value = valueParts[i]; Integer sampleId = orderedSampleList.get(i); From 656271652427647c85cc376f74286c7a2bc3d377 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 14 Jun 2024 16:01:38 +0200 Subject: [PATCH 108/130] Make method name to initialise transaction clearer --- src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java index 48f59d70..e75931b2 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java @@ -62,12 +62,12 @@ public class JdbcUtil { public static DataSource getDataSource() { if (dataSource == null) { dataSource = new TransactionAwareDataSourceProxy(new JdbcDataSource()); - initSpringTx(); + setupTransactionManagement(); } return dataSource; } - private static void initSpringTx() { + private static void setupTransactionManagement() { transactionManager = new DataSourceTransactionManager(dataSource); transactionTemplate = new 
TransactionTemplate(transactionManager); } @@ -78,7 +78,7 @@ private static void initSpringTx() { */ public static void setDataSource(DataSource value) { dataSource = value; - initSpringTx(); + setupTransactionManagement(); } public static TransactionTemplate getTransactionTemplate() { From b0a448ea62d53de83ef3c3e163db6c8c9596b55d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 14 Jun 2024 16:09:20 +0200 Subject: [PATCH 109/130] Remove TODOs that were done --- src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java index de7fe85a..b5aa293e 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java @@ -45,7 +45,6 @@ public CnaUtil(String[] headerParts, Set namespaces) { this.namespaceColumnParser = new NamespaceColumnParser(namespaces, headerParts); } - // TODO inc: update public static void storeCnaEvents( Set existingCnaEvents, List cnaEventsToAdd @@ -54,7 +53,6 @@ public static void storeCnaEvents( if (!CNA.AMP.equals(cnaEvent.getAlteration()) && !CNA.HOMDEL.equals(cnaEvent.getAlteration())) { continue; } - // TODO Clean cnv event // Revert PR https://github.com/cBioPortal/cbioportal-core/pull/1 breaks importer Optional existingCnaEvent = existingCnaEvents .stream() From f3d76c74e2da49c29de22cc19f78bfe2c11c05a1 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 14 Jun 2024 16:12:08 +0200 Subject: [PATCH 110/130] Rename isInfoLine util. 
method to isDataLine I got feedback that "info line" sounds like the header metadata lines starting with # --- .../java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java | 2 +- .../cbio/portal/scripts/ImportCnaDiscreteLongData.java | 2 +- .../java/org/mskcc/cbio/portal/scripts/ImportDrugs.java | 2 +- .../cbio/portal/scripts/ImportExtendedMutationData.java | 2 +- .../mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java | 2 +- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 2 +- .../cbio/portal/scripts/ImportStructuralVariantData.java | 2 +- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 2 +- .../java/org/mskcc/cbio/portal/scripts/ImportUsers.java | 2 +- src/main/java/org/mskcc/cbio/portal/util/FileUtil.java | 2 +- src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java | 6 +++--- src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java | 2 +- 12 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java index 686fccc8..d9cfad9d 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/CutInvalidCases.java @@ -110,7 +110,7 @@ private HashSet getExcludedCases() throws IOException { HashSet excludedCaseSet = new HashSet(); while (line != null) { - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { String parts[] = line.split("\t"); excludedCaseSet.add(parts[0]); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 6bc7f226..fe4252d4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -184,7 +184,7 @@ public void extractDataToImport( int lineIndex, CnaImportData importContainer ) throws 
Exception { - if (!TsvUtil.isInfoLine(line)) { + if (!TsvUtil.isDataLine(line)) { return; } String[] lineParts = TsvUtil.splitTsvLine(line); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java index dab65f56..f05362ff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportDrugs.java @@ -65,7 +65,7 @@ public void importData() throws IOException, DaoException { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { line = line.trim(); String parts[] = line.split("\t"); String geneSymbol = parts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 24891046..f392e7f7 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -193,7 +193,7 @@ public void importData() throws IOException, DaoException { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if(TsvUtil.isInfoLine(line)) + if(TsvUtil.isDataLine(line)) { String[] parts = TsvUtil.splitTsvLine(line); MafRecord record = mafUtil.parseRecord(line); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index bc91f60d..f102c420 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -186,7 +186,7 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = buf.readLine(); while (currentLine != null) { - if (!TsvUtil.isInfoLine(currentLine)) { + if 
(!TsvUtil.isDataLine(currentLine)) { currentLine = buf.readLine(); continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index 846dd1ff..88d1164c 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -185,7 +185,7 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int patientSta boolean recordIsStored = false; - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { String[] parts = TsvUtil.splitTsvLine(line); if (parts.length > nrColumns) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index c22829ba..c6cd7b54 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -83,7 +83,7 @@ public void importData() throws IOException, DaoException { while ((line = buf.readLine()) != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if(TsvUtil.isInfoLine(line)) { + if(TsvUtil.isDataLine(line)) { recordCount++; String parts[] = TsvUtil.splitTsvLine(line); StructuralVariant structuralVariant = structuralVariantUtil.parseStructuralVariantRecord(parts); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 895f4d6e..dd46b637 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -326,7 +326,7 @@ private void doImportData() throws IOException, DaoException { ConsoleUtil.showProgress(); boolean recordAdded = false; - 
if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { String[] rowParts = TsvUtil.splitTsvLine(line); TsvUtil.ensureHeaderAndRowMatch(headerParts, rowParts); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java index b76ec3cb..fbaa030f 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportUsers.java @@ -78,7 +78,7 @@ public static void main(String[] args) throws Exception { while (line != null) { ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { try { addUser(line); count++; diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 75bf743e..4f0958ee 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -56,7 +56,7 @@ public static int getNumLines(File file) throws IOException { try (FileReader reader = new FileReader(file); BufferedReader buffered = new BufferedReader(reader)) { String line = buffered.readLine(); while (line != null) { - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { numLines++; } line = buffered.readLine(); diff --git a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java index 5402d560..a07dc52e 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java @@ -6,12 +6,12 @@ */ public class TsvUtil { /** - * Does line brings any information? - * e.g. blank like and comments do not + * is the line has some data + * e.g. 
blank line and comments do not * @param line * @return */ - public static boolean isInfoLine(String line) { + public static boolean isDataLine(String line) { return !line.startsWith("#") && line.trim().length() > 0; } diff --git a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java index 3577af14..d5229172 100644 --- a/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java +++ b/src/test/java/org/mskcc/cbio/portal/util/TestMafUtil.java @@ -270,7 +270,7 @@ public void testResolveAscnAnnotationNamespace() throws Exception { List ascnRecords = new ArrayList<>(); while((line=buf.readLine()) != null) { - if (TsvUtil.isInfoLine(line)) { + if (TsvUtil.isDataLine(line)) { MafRecord record = mafUtil.parseRecord(line); // every record in test MAF should have ASCN data Assert.assertTrue(record.getNamespacesMap().containsKey(ASCN_NAMESPACE)); From f5448470287ca94f71112abc216acab7d68a2ed3 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 14 Jun 2024 16:41:59 +0200 Subject: [PATCH 111/130] Simplify code by using inheritence instead of composition --- .../scripts/GeneticAlterationImporter.java | 94 +++++++++++++++- .../GeneticAlterationImporterImpl.java | 105 ------------------ .../GeneticAlterationIncrementalImporter.java | 26 ++--- .../scripts/ImportCnaDiscreteLongData.java | 2 +- .../portal/scripts/ImportTabDelimData.java | 2 +- 5 files changed, 102 insertions(+), 127 deletions(-) delete mode 100644 src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index f0990dad..e0a3d0cc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -1,20 +1,102 @@ package org.mskcc.cbio.portal.scripts; import 
org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.ProgressMonitor; -public interface GeneticAlterationImporter { +import java.util.HashSet; +import java.util.List; +import java.util.Set; - boolean store( +import static java.lang.String.format; + +public class GeneticAlterationImporter { + + protected int geneticProfileId; + protected List orderedSampleList; + private final Set importSetOfGenes = new HashSet<>(); + private final Set importSetOfGeneticEntityIds = new HashSet<>(); + + private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + + protected GeneticAlterationImporter() {} + public GeneticAlterationImporter( + int geneticProfileId, + List orderedSampleList + ) throws DaoException { + this.geneticProfileId = geneticProfileId; + this.orderedSampleList = orderedSampleList; + storeOrderedSampleList(); + } + + protected void storeOrderedSampleList() throws DaoException { + int rowCount = DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + if (rowCount < 1) { + throw new IllegalStateException("Failed to store the ordered sample list."); + } + } + + /** + * Check that we have not already imported information regarding this gene. + * This is an important check, because a GISTIC or RAE file may contain + * multiple rows for the same gene, and we only want to import the first row. 
+ */ + public boolean store( String[] values, CanonicalGene gene, String geneSymbol - ) throws DaoException; + ) throws DaoException { + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGenes.add(gene.getEntrezGeneId())) { + daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); + return true; + } + String geneSymbolMessage = ""; + if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { + geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; + } + ProgressMonitor.logWarning(format( + "Gene %s (%d)%s found to be duplicated in your file. Duplicated row will be ignored!", + gene.getHugoGeneSymbolAllCaps(), + gene.getEntrezGeneId(), + geneSymbolMessage) + ); + return false; + } + - boolean store( + /** + * Universal method that stores values for different genetic entities + * @param geneticEntityId + * @param values + * @return true if entity has been stored, false - if entity already existed + * @throws DaoException + */ + public boolean store( int geneticEntityId, String[] values - ) throws DaoException; + ) throws DaoException { + ensureNumberOfValuesIsCorrect(values.length); + if (importSetOfGeneticEntityIds.add(geneticEntityId)) { + daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); + return true; + } + ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. 
Record will be skipped."); + return false; + } + + private void ensureNumberOfValuesIsCorrect(int valuesNumber) { + if (valuesNumber != orderedSampleList.size()) { + throw new IllegalArgumentException("There has to be " + orderedSampleList.size() + " values, but only " + valuesNumber+ " has passed."); + } + } + + public boolean isImportedAlready(CanonicalGene gene) { + return importSetOfGenes.contains(gene.getEntrezGeneId()); + } + - void finalise(); + public void finalise() { } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java deleted file mode 100644 index 7589d3f8..00000000 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporterImpl.java +++ /dev/null @@ -1,105 +0,0 @@ -package org.mskcc.cbio.portal.scripts; - -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; -import org.mskcc.cbio.portal.dao.DaoGeneticProfileSamples; -import org.mskcc.cbio.portal.model.CanonicalGene; -import org.mskcc.cbio.portal.util.ProgressMonitor; - -import java.util.HashSet; -import java.util.List; -import java.util.Set; - -import static java.lang.String.format; - -public class GeneticAlterationImporterImpl implements GeneticAlterationImporter { - - private final int geneticProfileId; - private final Set importSetOfGenes = new HashSet<>(); - private final Set importSetOfGeneticEntityIds = new HashSet<>(); - - private final List orderedSampleList; - - private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - - public GeneticAlterationImporterImpl( - int geneticProfileId, - List orderedSampleList - ) throws DaoException { - this.geneticProfileId = geneticProfileId; - this.orderedSampleList = orderedSampleList; - storeOrderedSampleList(); - } - - private void storeOrderedSampleList() throws DaoException { - int rowCount = 
DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - if (rowCount < 1) { - throw new IllegalStateException("Failed to store the ordered sample list."); - } - } - - /** - * Check that we have not already imported information regarding this gene. - * This is an important check, because a GISTIC or RAE file may contain - * multiple rows for the same gene, and we only want to import the first row. - */ - @Override - public boolean store( - String[] values, - CanonicalGene gene, - String geneSymbol - ) throws DaoException { - ensureNumberOfValuesIsCorrect(values.length); - if (importSetOfGenes.add(gene.getEntrezGeneId())) { - daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); - return true; - } - String geneSymbolMessage = ""; - if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) { - geneSymbolMessage = " (given as alias in your file as: " + geneSymbol + ")"; - } - ProgressMonitor.logWarning(format( - "Gene %s (%d)%s found to be duplicated in your file. Duplicated row will be ignored!", - gene.getHugoGeneSymbolAllCaps(), - gene.getEntrezGeneId(), - geneSymbolMessage) - ); - return false; - } - - - /** - * Universal method that stores values for different genetic entities - * @param geneticEntityId - * @param values - * @return true if entity has been stored, false - if entity already existed - * @throws DaoException - */ - @Override - public boolean store( - int geneticEntityId, - String[] values - ) throws DaoException { - ensureNumberOfValuesIsCorrect(values.length); - if (importSetOfGeneticEntityIds.add(geneticEntityId)) { - daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfileId, geneticEntityId, values); - return true; - } - ProgressMonitor.logWarning("Data for genetic entity with id " + geneticEntityId + " already imported from file. 
Record will be skipped."); - return false; - } - - private void ensureNumberOfValuesIsCorrect(int valuesNumber) { - if (valuesNumber != orderedSampleList.size()) { - throw new IllegalArgumentException("There has to be " + orderedSampleList.size() + " values, but only " + valuesNumber+ " has passed."); - } - } - - public boolean isImportedAlready(CanonicalGene gene) { - return importSetOfGenes.contains(gene.getEntrezGeneId()); - } - - - @Override - public void finalise() { } -} diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index cdca8413..d20e7123 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -13,12 +13,9 @@ import java.util.List; import java.util.Map; -public class GeneticAlterationIncrementalImporter implements GeneticAlterationImporter { +public class GeneticAlterationIncrementalImporter extends GeneticAlterationImporter { - private final GeneticAlterationImporterImpl geneticAlterationImporter; - private final int geneticProfileId; private final List fileOrderedSampleList; - private final List extendedOrderedSampleList; private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); private final HashMap> geneticAlterationMap; @@ -26,7 +23,8 @@ public GeneticAlterationIncrementalImporter( int geneticProfileId, List fileOrderedSampleList ) throws DaoException { - this.geneticProfileId = geneticProfileId; + + super.geneticProfileId = geneticProfileId; this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); this.fileOrderedSampleList = fileOrderedSampleList; @@ -41,11 +39,11 @@ public GeneticAlterationIncrementalImporter( } }); // add all new sample ids at the end - this.extendedOrderedSampleList = 
new ArrayList<>(savedOrderedSampleList); + super.orderedSampleList = new ArrayList<>(savedOrderedSampleList); List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); - this.extendedOrderedSampleList.addAll(newSampleIds); + super.orderedSampleList.addAll(newSampleIds); DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); - this.geneticAlterationImporter = new GeneticAlterationImporterImpl(geneticProfileId, extendedOrderedSampleList); + super.storeOrderedSampleList(); } @Override @@ -53,28 +51,28 @@ public boolean store(String[] values, CanonicalGene gene, String geneSymbol) thr int geneticEntityId = gene.getGeneticEntityId(); daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); String[] expandedValues = extendValues(geneticEntityId, values); - return geneticAlterationImporter.store(expandedValues, gene, geneSymbol); + return super.store(expandedValues, gene, geneSymbol); } @Override public boolean store(int geneticEntityId, String[] values) throws DaoException { daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); String[] expandedValues = extendValues(geneticEntityId, values); - return geneticAlterationImporter.store(geneticEntityId, expandedValues); + return super.store(geneticEntityId, expandedValues); } @Override public void finalise() { expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); - geneticAlterationImporter.finalise(); + super.finalise(); } private String[] extendValues(int geneticEntityId, String[] values) { Map sampleIdToValue = mapWithFileOrderedSampleList(values); - String[] updatedSampleValues = new String[extendedOrderedSampleList.size()]; - for (int i = 0; i < extendedOrderedSampleList.size(); i++) { + String[] updatedSampleValues = new String[orderedSampleList.size()]; + for (int i = 0; i < orderedSampleList.size(); i++) { updatedSampleValues[i] = ""; - int 
sampleId = extendedOrderedSampleList.get(i); + int sampleId = orderedSampleList.get(i); if (geneticAlterationMap.containsKey(geneticEntityId)) { HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? savedSampleIdToValue.remove(sampleId): ""; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index fe4252d4..8dbf8789 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -152,7 +152,7 @@ private void doImportData() throws Exception { orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) - : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); + : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index dd46b637..065549eb 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -303,7 +303,7 @@ private void doImportData() throws IOException, DaoException { ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); this.geneticAlterationImporter = isIncrementalUpdateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) - : new GeneticAlterationImporterImpl(geneticProfileId, orderedSampleList); + : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); From ab51a4b601dfdb6df753e8a6d58e048b45967269 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 19 Jun 2024 14:33:20 +0200 Subject: [PATCH 112/130] Optimize removing genetic alterations by removing them for the whole genetic profile at once. one sql statment instead of N --- .../org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java | 6 ++---- .../scripts/GeneticAlterationIncrementalImporter.java | 3 +-- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index ce3980d8..25cd987a 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -441,19 +441,17 @@ public int getCount() throws DaoException { * Deletes all Genetic Alteration Records associated with the specified Genetic Profile ID. * * @param geneticProfileId Genetic Profile ID. - * @param geneticEntityId Genetic Entity ID. * @throws DaoException Database Error. */ - public void deleteAllRecordsInGeneticProfile(long geneticProfileId, long geneticEntityId) throws DaoException { + public void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoGeneticAlteration.class); pstmt = con.prepareStatement("DELETE from " + - "genetic_alteration WHERE GENETIC_PROFILE_ID=? 
and GENETIC_ENTITY_ID=?"); + "genetic_alteration WHERE GENETIC_PROFILE_ID=?"); pstmt.setLong(1, geneticProfileId); - pstmt.setLong(2, geneticEntityId); pstmt.executeUpdate(); } catch (SQLException e) { throw new DaoException(e); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index d20e7123..0b9b2515 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -44,19 +44,18 @@ public GeneticAlterationIncrementalImporter( super.orderedSampleList.addAll(newSampleIds); DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); super.storeOrderedSampleList(); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); } @Override public boolean store(String[] values, CanonicalGene gene, String geneSymbol) throws DaoException { int geneticEntityId = gene.getGeneticEntityId(); - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); String[] expandedValues = extendValues(geneticEntityId, values); return super.store(expandedValues, gene, geneSymbol); } @Override public boolean store(int geneticEntityId, String[] values) throws DaoException { - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfileId, geneticEntityId); String[] expandedValues = extendValues(geneticEntityId, values); return super.store(geneticEntityId, expandedValues); } From 96acec541d3afec7ac19f189cb1cd0a471dbcafc Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 19 Jun 2024 14:37:22 +0200 Subject: [PATCH 113/130] Access inherited variables with this. intead of super. the confusion that triggered the change: The use of super. indicates that the subclass also declares one with the same name, but you are trying to not set that somehow? 
--- .../scripts/GeneticAlterationIncrementalImporter.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index 0b9b2515..e8e70149 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -24,7 +24,7 @@ public GeneticAlterationIncrementalImporter( List fileOrderedSampleList ) throws DaoException { - super.geneticProfileId = geneticProfileId; + this.geneticProfileId = geneticProfileId; this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); this.fileOrderedSampleList = fileOrderedSampleList; @@ -39,11 +39,11 @@ public GeneticAlterationIncrementalImporter( } }); // add all new sample ids at the end - super.orderedSampleList = new ArrayList<>(savedOrderedSampleList); + this.orderedSampleList = new ArrayList<>(savedOrderedSampleList); List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); - super.orderedSampleList.addAll(newSampleIds); + this.orderedSampleList.addAll(newSampleIds); DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); - super.storeOrderedSampleList(); + this.storeOrderedSampleList(); daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); } From e79544944fa8b34497869bee1ef96810089cded1 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 16:43:39 +0200 Subject: [PATCH 114/130] Remove unused code from DaoSampleList.addSampleList() --- src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java 
b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index cd5fef98..4427c0c1 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -50,7 +50,6 @@ public class DaoSampleList { public int addSampleList(SampleList sampleList) throws DaoException { Connection con = null; PreparedStatement pstmt = null; - ResultSet rs = null; int rows; try { con = JdbcUtil.getDbConnection(DaoSampleList.class); @@ -66,8 +65,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { try (ResultSet generatedKey = pstmt.getGeneratedKeys()) { if (generatedKey.next()) { int listId = generatedKey.getInt(1); - int listListRow = addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); - rows = (listListRow != -1) ? (rows + listListRow) : rows; + rows += addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); } else { throw new DaoException("Creating sample list failed, no ID obtained."); } @@ -75,7 +73,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { } catch (SQLException e) { throw new DaoException(e); } finally { - JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, rs); + JdbcUtil.closeAll(DaoSampleList.class, con, pstmt, null); } return rows; From df8f7aff66ddba603a7ae3276eceec7c4f7015d6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 16:49:07 +0200 Subject: [PATCH 115/130] Remove extra semicolons at the end of java statements --- src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java | 2 +- .../mskcc/cbio/portal/scripts/ImportExtendedMutationData.java | 2 +- .../org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java b/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java index 3414fc4b..84d8984e 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/MySQLbulkLoader.java @@ -180,7 +180,7 @@ public void insertRecord( String... fieldValues) { tempFileWriter.write( "\t" ); tempFileWriter.write( escapeValue(fieldValues[i]) ); } - tempFileWriter.write("\n");; + tempFileWriter.write("\n"); if( rows++ < numDebuggingRowsToPrint ){ StringBuffer sb = new StringBuffer( escapeValue(fieldValues[0]) ); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index f392e7f7..63c83022 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -119,7 +119,7 @@ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, Strin this.mutationFile = mutationFile; this.geneticProfileId = geneticProfileId; this.swissprotIsAccession = false; - this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);; + this.genePanelId = (genePanel == null) ? 
null : GeneticProfileUtil.getGenePanelId(genePanel); this.filteredMutations = filteredMutations; // create default MutationFilter diff --git a/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java b/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java index 29079cd6..409db2bd 100644 --- a/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java +++ b/src/test/java/org/mskcc/cbio/portal/servlet/NullHttpServletResponse.java @@ -235,7 +235,7 @@ public String getContentType() { // properties /////////////////////////////////////////////////////////////// - private ServletOutputStream servletOutputStream = null; // new NullServletOutputStream();; + private ServletOutputStream servletOutputStream = null; // new NullServletOutputStream(); private StringWriter myStringWriter = new StringWriter(); public String getOutput(){ From f120f5d14e4c2928309d89a041c8fdfb1920476a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 16:53:16 +0200 Subject: [PATCH 116/130] Rename upsertSampleProfiles to upsertSampleToProfileMapping method in DaoSampleProfile --- .../java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java | 6 +++--- .../cbio/portal/scripts/ImportCnaDiscreteLongData.java | 2 +- .../cbio/portal/scripts/ImportExtendedMutationData.java | 4 ++-- .../cbio/portal/scripts/ImportGenePanelProfileMap.java | 2 +- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 2 +- .../cbio/portal/scripts/ImportStructuralVariantData.java | 2 +- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 2 +- .../portal/integrationTest/dao/TestDaoSampleProfile.java | 4 ++-- .../TestIncrementalStructuralVariantsImport.java | 2 +- 9 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java index 0f2d8f28..5a6a9e5c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java +++ 
b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java @@ -61,15 +61,15 @@ private DaoSampleProfile() {} private static final int NO_SUCH_PROFILE_ID = -1; - public static void upsertSampleProfiles(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { - upsertSampleProfiles( + public static void upsertSampleToProfileMapping(Collection sampleIds, Integer geneticProfileId, Integer panelId) throws DaoException { + upsertSampleToProfileMapping( sampleIds.stream() .map(sampleId -> new SampleProfileTuple(geneticProfileId, sampleId, panelId)).toList()); } public record SampleProfileTuple(int geneticProfileId, int sampleId, Integer panelId) {} - public static void upsertSampleProfiles(Collection idTuples) throws DaoException { + public static void upsertSampleToProfileMapping(Collection idTuples) throws DaoException { if (idTuples.isEmpty()) { return; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 8dbf8789..765986b5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -153,7 +153,7 @@ private void doImportData() throws Exception { orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); - DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelId); + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { boolean added = storeGeneticAlterations(toImport, entrezId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index 63c83022..4cc8fb20 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -474,7 +474,7 @@ public void importData() throws IOException, DaoException { } } } - DaoSampleProfile.upsertSampleProfiles(internalSampleIds, geneticProfileId, genePanelId); + DaoSampleProfile.upsertSampleToProfileMapping(internalSampleIds, geneticProfileId, genePanelId); for (MutationEvent event : newEvents) { try { @@ -611,7 +611,7 @@ private String processMAFHeader(BufferedReader buffer) throws IOException, DaoEx } line = buffer.readLine().trim(); } - DaoSampleProfile.upsertSampleProfiles(internalSampleIds, geneticProfileId, genePanelId); + DaoSampleProfile.upsertSampleToProfileMapping(internalSampleIds, geneticProfileId, genePanelId); return line; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java index d6812d9c..e48df328 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenePanelProfileMap.java @@ -196,7 +196,7 @@ public void importData() throws Exception { sampleProfileTuples.add(new DaoSampleProfile.SampleProfileTuple(geneticProfileId, 
sampleInternalId, genePanelId)); } - DaoSampleProfile.upsertSampleProfiles(sampleProfileTuples); + DaoSampleProfile.upsertSampleToProfileMapping(sampleProfileTuples); } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index 88d1164c..ad10fe01 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -122,7 +122,7 @@ public void importData() throws IOException, DaoException { } } Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelID); + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelID); ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines-1)); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java index c6cd7b54..d7e4b66a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportStructuralVariantData.java @@ -183,7 +183,7 @@ public void importData() throws IOException, DaoException { } } - DaoSampleProfile.upsertSampleProfiles(sampleIds, geneticProfileId, genePanelId); + DaoSampleProfile.upsertSampleToProfileMapping(sampleIds, geneticProfileId, genePanelId); if (isIncrementalUpdateMode) { DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 065549eb..37736a96 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -386,7 +386,7 @@ private void doImportData() throws IOException, DaoException { line = buf.readLine(); } - DaoSampleProfile.upsertSampleProfiles(orderedSampleList, geneticProfileId, genePanelId); + DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); geneticAlterationImporter.finalise(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java index 33d4d15d..d4c80a8e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoSampleProfile.java @@ -106,7 +106,7 @@ public void testDaoSampleProfile() throws DaoException { Patient patient = DaoPatient.getPatientByCancerStudyAndPatientId(study.getInternalId(), "TCGA-12345"); Sample sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-12345-01"); - DaoSampleProfile.upsertSampleProfiles(List.of( + DaoSampleProfile.upsertSampleToProfileMapping(List.of( new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), null))); boolean exists = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId); @@ -115,7 +115,7 @@ public void testDaoSampleProfile() throws DaoException { assertEquals(geneticProfileId, DaoSampleProfile.getProfileIdForSample(sample.getInternalId())); sample = DaoSample.getSampleByPatientAndSampleId(patient.getInternalId(), "TCGA-123456-01"); - DaoSampleProfile.upsertSampleProfiles(List.of( + DaoSampleProfile.upsertSampleToProfileMapping(List.of( new DaoSampleProfile.SampleProfileTuple(geneticProfileId, sample.getInternalId(), genePanel.getInternalId()))); 
boolean existsByPanelId = DaoSampleProfile.sampleProfileMappingExistsByPanel(genePanel.getInternalId()); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java index 80f49e67..e477d91f 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalStructuralVariantsImport.java @@ -91,7 +91,7 @@ public void testIncrementalUpload() throws DaoException { structuralVariant.setComments("This record has to be overwritten"); DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant); MySQLbulkLoader.flushAll(); - DaoSampleProfile.upsertSampleProfiles(List.of( + DaoSampleProfile.upsertSampleToProfileMapping(List.of( new DaoSampleProfile.SampleProfileTuple(svGeneticProfile.getGeneticProfileId(), svDataSample.getInternalId(), null))); File singleTcgaSampleFolder = new File("src/test/resources/incremental/structural_variants/"); From 602cc2432403de18c0c4c626ee46c3c205e525bb Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 17:08:14 +0200 Subject: [PATCH 117/130] Use java 8 way to convert typed list to array in GeneticAlterationIncrementalImporter --- .../portal/scripts/GeneticAlterationIncrementalImporter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index e8e70149..2e1117a4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -87,7 +87,7 @@ private String[] extendValues(int 
geneticEntityId, String[] values) { } private Map mapWithFileOrderedSampleList(String[] values) { - return ArrayUtil.zip(fileOrderedSampleList.toArray(new Integer[0]), values); + return ArrayUtil.zip(fileOrderedSampleList.toArray(Integer[]::new), values); } private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { From 28dfa054af695c215a97b5bfbaedf53e92f4d9bb Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 17:21:56 +0200 Subject: [PATCH 118/130] Improve doc comments for TsvUtil.isDataLine(String line) --- src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java index a07dc52e..0c2e61a2 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/TsvUtil.java @@ -6,10 +6,10 @@ */ public class TsvUtil { /** - * is the line has some data - * e.g. blank line and comments do not - * @param line - * @return + * Detects if the line has some data + * e.g. 
blank line and comments are not considered as data rows + * @param line the line to evaluate + * @return true if the line contains data, false otherwise */ public static boolean isDataLine(String line) { return !line.startsWith("#") && line.trim().length() > 0; From 2dd1e62880a2a41cbb241c1285e79d696c5f6141 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 21 Jun 2024 18:02:03 +0200 Subject: [PATCH 119/130] Rename and better document method to updateCaseLists --- .../scripts/UpdateCaseListsSampleIds.java | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java index b8ab514e..865cc660 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/UpdateCaseListsSampleIds.java @@ -70,7 +70,7 @@ public void run() { this.caseListSampleIdToSampleIds.put(cancerStudyStableId + "_all", this.allSampleIds); Map> readCaseListSampleIds = readCaseListFiles(); this.caseListSampleIdToSampleIds.putAll(readCaseListSampleIds); - updateCaseLists(this.caseListSampleIdToSampleIds); + updateCaseListsForTheStudy(this.caseListSampleIdToSampleIds); } private Map> readCaseListFiles() { @@ -96,7 +96,20 @@ private Map> readCaseListFiles() { return result; } - private void updateCaseLists(Map> caseListSampleIdToSampleIds) { + /** + * Updates the sample lists according to the steps below: + * + * 1. New sample IDs provided in the `caseListSampleIdToSampleIds` map are added to their corresponding case lists. + * 2. These sample IDs are removed from any other case lists within the same study. + * + * @param caseListSampleIdToSampleIds A map where the key is the case list stable ID and the value is a set of sample IDs + * to be added to the corresponding case list.
+ * Note: This map only includes the case lists that need to be updated with new sample IDs. + * Existing case lists in the study that are not in the map will not be dropped, + * but the provided sample IDs will be removed from these lists if present. + * @throws RuntimeException if any DAO operations fail or if a case list with a specified stable ID is not found. + */ + private void updateCaseListsForTheStudy(Map> caseListSampleIdToSampleIds) { DaoCancerStudy.reCacheAll(); try { for (Map.Entry> caseListStableIdToSampleIds : caseListSampleIdToSampleIds.entrySet()) { From a8f014aaeb91a71bf87bd5b961e2ee4317d16867 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 15:52:26 +0200 Subject: [PATCH 120/130] Remove DEFINED_CANCER_TYPES global variable --- scripts/importer/validateData.py | 4 +--- tests/unit_tests_validate_data.py | 2 -- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/scripts/importer/validateData.py b/scripts/importer/validateData.py index 3747b0bd..079479ba 100755 --- a/scripts/importer/validateData.py +++ b/scripts/importer/validateData.py @@ -70,7 +70,6 @@ DEFINED_SAMPLE_IDS = None DEFINED_SAMPLE_ATTRIBUTES = None PATIENTS_WITH_SAMPLES = None -DEFINED_CANCER_TYPES = None mutation_sample_ids = None mutation_file_sample_ids = set() sample_ids_panel_dict = {} @@ -5388,7 +5387,6 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ cancer_type_validators[0].validate() studydefined_cancer_types = ( cancer_type_validators[0].defined_cancer_types) - DEFINED_CANCER_TYPES = studydefined_cancer_types # next check the cancer type of the meta_study file if cbioportal_common.MetaFileTypes.STUDY not in validators_by_meta_type: @@ -5396,7 +5394,7 @@ def validate_study(study_dir, portal_instance, logger, relaxed_mode, strict_maf_ return if portal_instance.cancer_type_dict is not None and not ( study_cancer_type in portal_instance.cancer_type_dict or - study_cancer_type in DEFINED_CANCER_TYPES): + 
study_cancer_type in studydefined_cancer_types): logger.error( 'Cancer type of study is neither known to the portal nor defined ' 'in a cancer_type file', diff --git a/tests/unit_tests_validate_data.py b/tests/unit_tests_validate_data.py index f583f7e6..3089d33b 100755 --- a/tests/unit_tests_validate_data.py +++ b/tests/unit_tests_validate_data.py @@ -2297,12 +2297,10 @@ class StudyCompositionTestCase(LogBufferTestCase): def setUp(self): """Store validateData globals changed by running validate_study().""" super(StudyCompositionTestCase, self).setUp() - self.orig_defined_cancer_types = validateData.DEFINED_CANCER_TYPES self.orig_defined_sample_ids = validateData.DEFINED_SAMPLE_IDS def tearDown(self): """Restore the environment to before setUp() was called.""" - validateData.DEFINED_CANCER_TYPES = self.orig_defined_cancer_types validateData.DEFINED_SAMPLE_IDS = self.orig_defined_sample_ids super(StudyCompositionTestCase, self).tearDown() From 0df91c923d996fd1e4d1c4a0c1c93b929f46abfa Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 16:32:30 +0200 Subject: [PATCH 121/130] Add docstring to sample attribute remove methods Make it explicit that the function will delete any matching records "if they exist" --- .../org/mskcc/cbio/portal/dao/DaoClinicalData.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java index 4ac8f1e6..4d9bfde1 100755 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalData.java @@ -367,6 +367,12 @@ public static List getSampleData(int cancerStudyId, Collection sampleInternalIds, String attrId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; From 990218d41f943ccb60b3105495607cf128087b21 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 16:43:13 +0200 Subject: [PATCH
122/130] Add docstring to method to update fraction genome altered clinical attribute Specify that sampleIds is optional and can be set to null --- .../org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java index a71166f7..bb099498 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCopyNumberSegment.java @@ -67,6 +67,15 @@ public static int addCopyNumberSegment(CopyNumberSegment seg) throws DaoExceptio } } + /** + * Ensures FRACTION_GENOME_ALTERED clinical sample attribute is created and up to date. + * @param cancerStudyId - id of the study to create the clinical attribute for + * @param sampleIds - specifies for which samples to calculate this attribute. + * if sampleIds=null, the calculation is done for all samples in the study + * @param updateMode - if true, updates the attribute if it exists + * @throws DaoException + */ + public static void createFractionGenomeAlteredClinicalData(int cancerStudyId, Set sampleIds, boolean updateMode) throws DaoException { Connection con = null; PreparedStatement pstmt = null; From 2bbe3c5c261b1e74766394a3ba46424250a7d2af Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 17:05:17 +0200 Subject: [PATCH 123/130] Make DAO constants that hold SQL private to increase encapsulation --- src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java | 4 ++-- src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java index b8d1e4f9..730a288d 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoMutation.java @@ -67,8
+67,8 @@ public final class DaoMutation { public static final String NAN = "NaN"; private static final String MUTATION_COUNT_ATTR_ID = "MUTATION_COUNT"; - public static final String DELETE_ALTERATION_DRIVER_ANNOTATION = "DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; - public static final String DELETE_MUTATION = "DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; + private static final String DELETE_ALTERATION_DRIVER_ANNOTATION = "DELETE from alteration_driver_annotation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; + private static final String DELETE_MUTATION = "DELETE from mutation WHERE GENETIC_PROFILE_ID=? and SAMPLE_ID=?"; public static int addMutation(ExtendedMutation mutation, boolean newMutationEvent) throws DaoException { if (!MySQLbulkLoader.isBulkLoad()) { diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index 4427c0c1..f3d28683 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -42,7 +42,7 @@ */ public class DaoSampleList { - public static final String DELETE_SAMPLE_LIST_LIST = "DELETE FROM sample_list_list WHERE `LIST_ID` = ?"; + private static final String DELETE_SAMPLE_LIST_LIST = "DELETE FROM sample_list_list WHERE `LIST_ID` = ?"; /** * Adds record to sample_list table. From bc76d3b03235298e6bc6c63d172bc2138c8bd9e9 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 17:15:03 +0200 Subject: [PATCH 124/130] Stop doing rows math, it's just a status! 
--- src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java index f3d28683..8950b7b5 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoSampleList.java @@ -65,7 +65,7 @@ public int addSampleList(SampleList sampleList) throws DaoException { try (ResultSet generatedKey = pstmt.getGeneratedKeys()) { if (generatedKey.next()) { int listId = generatedKey.getInt(1); - rows += addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); + addSampleListList(sampleList.getCancerStudyId(), listId, sampleList.getSampleList(), con); } else { throw new DaoException("Creating sample list failed, no ID obtained."); } From 0c0b754d1f5e526a980d6522f1c81ee02b456b0a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 17:24:51 +0200 Subject: [PATCH 125/130] Adopt C style of incrementing JDBC parameters --- src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index c155dce7..fbdbc6e1 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -135,9 +135,10 @@ public static void removeSampleCnaEvents(int cnaProfileId, List sampleI " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ?
AND sample_cna_event.`SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ")"); - pstmt.setInt(1, cnaProfileId); - for (int i = 0; i < sampleIds.size(); i++) { - pstmt.setInt(i + 2, sampleIds.get(i)); + int parameterIndex = 1; + pstmt.setInt(parameterIndex++, cnaProfileId); + for (Integer sampleId : sampleIds) { + pstmt.setInt(parameterIndex++, sampleId); } pstmt.executeUpdate(); } catch (SQLException e) { From 89d6a97bf924c41e00fb9dfab6d7fbd8a01a0f6b Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 20:43:03 +0200 Subject: [PATCH 126/130] Improve wording in error message --- .../portal/scripts/GeneticAlterationIncrementalImporter.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index 2e1117a4..755ae0ee 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -34,7 +34,7 @@ public GeneticAlterationIncrementalImporter( if (sampleToValue.size() != initialOrderSampleListSize) { throw new IllegalStateException("Number of samples (" + sampleToValue.size() + ") for genetic entity with id " - + geneticEntityId + " does not match with the number in the inital sample list (" + + geneticEntityId + " does not match with the number in the preexisting sample list (" + initialOrderSampleListSize + ")."); } }); From 0233279736fb8985b315961a98b10feb7c1196ec Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 20:50:31 +0200 Subject: [PATCH 127/130] Remove unused method of genetic alteration importer --- .../mskcc/cbio/portal/scripts/GeneticAlterationImporter.java | 5 ----- 1 file changed, 5 deletions(-) diff --git 
a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index e0a3d0cc..c076a411 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -93,10 +93,5 @@ private void ensureNumberOfValuesIsCorrect(int valuesNumber) { } } - public boolean isImportedAlready(CanonicalGene gene) { - return importSetOfGenes.contains(gene.getEntrezGeneId()); - } - - public void finalise() { } } From 0e6ce2e2b83ab3d2bbf49343d27ac81373920ef1 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 20:58:23 +0200 Subject: [PATCH 128/130] Extract db communicating methods out of the constructor introduce initialise() method --- .../scripts/GeneticAlterationImporter.java | 11 ++++- .../GeneticAlterationIncrementalImporter.java | 49 +++++++++++-------- .../scripts/ImportCnaDiscreteLongData.java | 1 + .../portal/scripts/ImportTabDelimData.java | 2 + 4 files changed, 40 insertions(+), 23 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index c076a411..a9cb4e63 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -25,10 +25,9 @@ protected GeneticAlterationImporter() {} public GeneticAlterationImporter( int geneticProfileId, List orderedSampleList - ) throws DaoException { + ) { this.geneticProfileId = geneticProfileId; this.orderedSampleList = orderedSampleList; - storeOrderedSampleList(); } protected void storeOrderedSampleList() throws DaoException { @@ -93,5 +92,13 @@ private void ensureNumberOfValuesIsCorrect(int valuesNumber) { } } + + public void initialise() { + try { + storeOrderedSampleList(); + } catch (DaoException e) 
{ + throw new RuntimeException(e); + } + } public void finalise() { } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index 755ae0ee..b6164d0b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -17,34 +17,15 @@ public class GeneticAlterationIncrementalImporter extends GeneticAlterationImpor private final List fileOrderedSampleList; private final DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - private final HashMap> geneticAlterationMap; + private HashMap> geneticAlterationMap; public GeneticAlterationIncrementalImporter( int geneticProfileId, List fileOrderedSampleList - ) throws DaoException { + ) { this.geneticProfileId = geneticProfileId; - this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); this.fileOrderedSampleList = fileOrderedSampleList; - - ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(this.geneticProfileId); - int initialOrderSampleListSize = savedOrderedSampleList.size(); - geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { - if (sampleToValue.size() != initialOrderSampleListSize) { - throw new IllegalStateException("Number of samples (" - + sampleToValue.size() + ") for genetic entity with id " - + geneticEntityId + " does not match with the number in the preexisting sample list (" - + initialOrderSampleListSize + ")."); - } - }); - // add all new sample ids at the end - this.orderedSampleList = new ArrayList<>(savedOrderedSampleList); - List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); - this.orderedSampleList.addAll(newSampleIds); - 
DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); - this.storeOrderedSampleList(); - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); } @Override @@ -60,6 +41,32 @@ public boolean store(int geneticEntityId, String[] values) throws DaoException { return super.store(geneticEntityId, expandedValues); } + @Override + public void initialise() { + try { + this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(this.geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the preexisting sample list (" + + initialOrderSampleListSize + ")."); + } + }); + // add all new sample ids at the end + this.orderedSampleList = new ArrayList<>(savedOrderedSampleList); + List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + this.orderedSampleList.addAll(newSampleIds); + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + super.initialise(); + } + @Override public void finalise() { expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index 765986b5..bcad7ee8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java 
+++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -153,6 +153,7 @@ private void doImportData() throws Exception { orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); + geneticAlterationGeneImporter.initialise(); DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 37736a96..a081ee35 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -305,6 +305,8 @@ private void doImportData() throws IOException, DaoException { this.geneticAlterationImporter = isIncrementalUpdateMode ? 
new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); + geneticAlterationImporter.initialise(); + //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); if (isDiscretizedCnaProfile) { From e40fb261d94355afb15fd25550e78977a4f15c67 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 11 Jul 2024 21:08:00 +0200 Subject: [PATCH 129/130] Improve time complexity from N^2 to N --- .../portal/scripts/GeneticAlterationIncrementalImporter.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index b6164d0b..46585f48 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -12,6 +12,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Set; public class GeneticAlterationIncrementalImporter extends GeneticAlterationImporter { @@ -57,7 +58,8 @@ public void initialise() { }); // add all new sample ids at the end this.orderedSampleList = new ArrayList<>(savedOrderedSampleList); - List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + Set savedSampleSet = new HashSet<>(savedOrderedSampleList); + List newSampleIds = this.fileOrderedSampleList.stream().filter(sampleId -> !savedSampleSet.contains(sampleId)).toList(); this.orderedSampleList.addAll(newSampleIds); DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(this.geneticProfileId); daoGeneticAlteration.deleteAllRecordsInGeneticProfile(this.geneticProfileId); From 641450c2d738f2fef5d46ed867447764d187689f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov 
Date: Fri, 12 Jul 2024 10:26:16 +0200 Subject: [PATCH 130/130] Use american english for method names --- .../cbio/portal/scripts/GeneticAlterationImporter.java | 4 ++-- .../scripts/GeneticAlterationIncrementalImporter.java | 8 ++++---- .../cbio/portal/scripts/ImportCnaDiscreteLongData.java | 4 ++-- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 ++-- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index a9cb4e63..e1d9a9f8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -93,12 +93,12 @@ private void ensureNumberOfValuesIsCorrect(int valuesNumber) { } - public void initialise() { + public void initialize() { try { storeOrderedSampleList(); } catch (DaoException e) { throw new RuntimeException(e); } } - public void finalise() { } + public void finalize() { } } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java index 46585f48..4177420a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationIncrementalImporter.java @@ -43,7 +43,7 @@ public boolean store(int geneticEntityId, String[] values) throws DaoException { } @Override - public void initialise() { + public void initialize() { try { this.geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, null); ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(this.geneticProfileId); @@ -66,13 +66,13 @@ public void initialise() { } catch (DaoException e) { throw new RuntimeException(e); } - super.initialise(); + 
super.initialize(); } @Override - public void finalise() { + public void finalize() { expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); - super.finalise(); + super.finalize(); } private String[] extendValues(int geneticEntityId, String[] values) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java index bcad7ee8..e158f2d5 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportCnaDiscreteLongData.java @@ -153,7 +153,7 @@ private void doImportData() throws Exception { orderedSampleList = newArrayList(toImport.eventsTable.columnKeySet()); this.geneticAlterationGeneImporter = isIncrementalUpdateMode ? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); - geneticAlterationGeneImporter.initialise(); + geneticAlterationGeneImporter.initialize(); DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); for (Long entrezId : toImport.eventsTable.rowKeySet()) { @@ -171,7 +171,7 @@ private void doImportData() throws Exception { ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + getSamplesSkipped()); buf.close(); - geneticAlterationGeneImporter.finalise(); + geneticAlterationGeneImporter.finalize(); MySQLbulkLoader.flushAll(); } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index a081ee35..75143ba3 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -305,7 +305,7 @@ private void doImportData() throws IOException, DaoException { this.geneticAlterationImporter = isIncrementalUpdateMode 
? new GeneticAlterationIncrementalImporter(geneticProfileId, orderedSampleList) : new GeneticAlterationImporter(geneticProfileId, orderedSampleList); - geneticAlterationImporter.initialise(); + geneticAlterationImporter.initialize(); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); @@ -389,7 +389,7 @@ private void doImportData() throws IOException, DaoException { line = buf.readLine(); } DaoSampleProfile.upsertSampleToProfileMapping(orderedSampleList, geneticProfileId, genePanelId); - geneticAlterationImporter.finalise(); + geneticAlterationImporter.finalize(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); }