Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

(3/7) RFC79: Implement incremental upload for timeline data #43

Merged
merged 1 commit into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -380,6 +380,7 @@ class MetaFileTypes(object):
MetaFileTypes.GENERIC_ASSAY_CONTINUOUS,
MetaFileTypes.GENERIC_ASSAY_BINARY,
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL,
MetaFileTypes.TIMELINE,
]

IMPORTER_CLASSNAME_BY_META_TYPE = {
Expand Down
27 changes: 24 additions & 3 deletions src/main/java/org/mskcc/cbio/portal/dao/DaoClinicalEvent.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@

package org.mskcc.cbio.portal.dao;

import org.apache.commons.lang3.StringUtils;
import org.mskcc.cbio.portal.model.ClinicalEvent;

import java.sql.Connection;
import java.sql.PreparedStatement;
import java.sql.ResultSet;
Expand All @@ -40,8 +43,6 @@
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.commons.lang3.StringUtils;
import org.mskcc.cbio.portal.model.ClinicalEvent;

/**
*
Expand All @@ -52,7 +53,7 @@ private DaoClinicalEvent() {}

public static int addClinicalEvent(ClinicalEvent clinicalEvent) {
if (!MySQLbulkLoader.isBulkLoad()) {
throw new IllegalStateException("Only buld load mode is allowed for importing clinical events");
throw new IllegalStateException("Only bulk load mode is allowed for importing clinical events");
}

MySQLbulkLoader.getMySQLbulkLoader("clinical_event").insertRecord(
Expand Down Expand Up @@ -202,6 +203,26 @@ public static void deleteByCancerStudyId(int cancerStudyId) throws DaoException
JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs);
}
}

public static void deleteByPatientId(int patientId) throws DaoException {
Connection con = null;
PreparedStatement pstmt = null;
ResultSet rs = null;
try {
con = JdbcUtil.getDbConnection(DaoClinicalEvent.class);

pstmt = con.prepareStatement("DELETE clinical_event, clinical_event_data" +
pieterlukasse marked this conversation as resolved.
Show resolved Hide resolved
" FROM clinical_event" +
" LEFT JOIN clinical_event_data ON clinical_event_data.CLINICAL_EVENT_ID = clinical_event.CLINICAL_EVENT_ID" +
" WHERE clinical_event.PATIENT_ID = ?");
pstmt.setInt(1, patientId);
pstmt.executeUpdate();
} catch (SQLException e) {
throw new DaoException(e);
} finally {
JdbcUtil.closeAll(DaoClinicalEvent.class, con, pstmt, rs);
}
}

public static void deleteAllRecords() throws DaoException {
Connection con = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -48,8 +48,10 @@
import java.io.FileReader;
import java.io.IOException;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Properties;
import java.util.Set;

/**
* Imports timeline data for display in patient view
Expand All @@ -58,7 +60,7 @@
*/
public class ImportTimelineData extends ConsoleRunnable {

private static void importData(String dataFile, int cancerStudyId) throws IOException, DaoException {
private static void importData(String dataFile, int cancerStudyId, boolean overwriteExisting) throws IOException, DaoException {
MySQLbulkLoader.bulkLoadOn();

ProgressMonitor.setCurrentMessage("Reading file " + dataFile);
Expand All @@ -81,9 +83,10 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce
throw new RuntimeException("The first line must start with\n'PATIENT_ID\tSTART_DATE\tEVENT_TYPE'\nor\n"
+ "PATIENT_ID\tSTART_DATE\tSTOP_DATE\tEVENT_TYPE");
}

long clinicalEventId = DaoClinicalEvent.getLargestClinicalEventId();

Set<Integer> processedPatientIds = new HashSet<>();

while ((line = buff.readLine()) != null) {
line = line.trim();

Expand All @@ -99,6 +102,9 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce
ProgressMonitor.logWarning("Patient " + patientId + " not found in study " + cancerStudyId + ". Skipping entry.");
continue;
}
if (overwriteExisting && processedPatientIds.add(patient.getInternalId())) {
DaoClinicalEvent.deleteByPatientId(patient.getInternalId());
}
ClinicalEvent event = new ClinicalEvent();
event.setClinicalEventId(++clinicalEventId);
event.setPatientId(patient.getInternalId());
Expand Down Expand Up @@ -128,17 +134,18 @@ private static void importData(String dataFile, int cancerStudyId) throws IOExce
public void run() {
try {
String description = "Import 'timeline' data";
OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true);

OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, false);
String dataFile = (String) options.valueOf("data");
File descriptorFile = new File((String) options.valueOf("meta"));
boolean overwriteExisting = options.has("overwrite-existing");

Properties properties = new TrimmedProperties();
properties.load(new FileInputStream(descriptorFile));

int cancerStudyInternalId = ValidationUtils.getInternalStudyId(properties.getProperty("cancer_study_identifier"));

importData(dataFile, cancerStudyInternalId);
importData(dataFile, cancerStudyInternalId, overwriteExisting);
} catch (RuntimeException e) {
throw e;
} catch (IOException|DaoException e) {
Expand Down
4 changes: 3 additions & 1 deletion src/main/java/org/mskcc/cbio/portal/util/ConsoleUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,10 @@ public static OptionSet parseStandardDataAndMetaOptions(String[] args, String de
parser.accepts( "loadMode", "direct (per record) or bulk load of data" )
.withRequiredArg().describedAs( "[directLoad|bulkLoad (default)]" ).ofType( String.class );
}
parser.accepts("overwrite-existing",
"Enables re-uploading molecular data that already exist for the given profile and sample.").withOptionalArg().describedAs("overwrite-existing").ofType(String.class);
pieterlukasse marked this conversation as resolved.
Show resolved Hide resolved
String progName = "importScript";

OptionSet options = null;
try {
options = parser.parse( args );
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
/*
* This file is part of cBioPortal.
*
* cBioPortal is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package org.mskcc.cbio.portal.integrationTest.incremental;

import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mskcc.cbio.portal.dao.DaoCancerStudy;
import org.mskcc.cbio.portal.dao.DaoClinicalEvent;
import org.mskcc.cbio.portal.dao.DaoException;
import org.mskcc.cbio.portal.dao.DaoPatient;
import org.mskcc.cbio.portal.dao.MySQLbulkLoader;
import org.mskcc.cbio.portal.model.CancerStudy;
import org.mskcc.cbio.portal.model.ClinicalEvent;
import org.mskcc.cbio.portal.model.Patient;
import org.mskcc.cbio.portal.scripts.ImportTimelineData;
import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.transaction.annotation.Transactional;

import java.io.File;
import java.util.List;
import java.util.Map;

import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertNull;

/**
* Tests Incremental Import of Timeline Data.
*
* @author Ruslan Forostianov
* @author Pieter Lukasse
*/
// Integration test for the patient-scoped incremental reload of timeline data:
// running ImportTimelineData with --overwrite-existing must replace previously
// stored clinical events of patients named in the data file, while leaving
// other patients untouched. Runs against the seeded test DB; @Rollback keeps
// the database clean after each test.
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" })
@Rollback
@Transactional
public class TestIncrementalTimelineImport {

    // Stable id of the study seeded in the integration-test database.
    public static final String STUDY_ID = "study_tcga_pub";
    // Resolved study record; looked up fresh before each test.
    private CancerStudy cancerStudy;

    @Before
    public void setUp() throws DaoException {
        cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID);
    }

    @Test
    public void testTimelineDataReloading() throws DaoException {
        // Seed one pre-existing SPECIMEN event for patient TCGA-A1-A0SB; the
        // importer's overwrite mode is expected to delete it before inserting
        // the rows from data_timeline.txt.
        MySQLbulkLoader.bulkLoadOn();
        ClinicalEvent event = new ClinicalEvent();
        event.setClinicalEventId(1L);
        Patient sbPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SB");
        event.setPatientId(sbPatient.getInternalId());
        event.setStartDate(5L);
        event.setEventType("SPECIMEN");
        event.setEventData(Map.of("SPECIMEN_SITE", "specimen_site_to_erase"));
        DaoClinicalEvent.addClinicalEvent(event);
        // Flush the bulk loader so the seeded event is actually in the DB
        // before the importer runs.
        MySQLbulkLoader.flushAll();

        File singleTcgaSampleFolder = new File("src/test/resources/incremental/clinical/");
        File metaFile = new File(singleTcgaSampleFolder, "meta_timeline.txt");
        File dataFile = new File(singleTcgaSampleFolder, "data_timeline.txt");

        // Run the importer exactly as the command line would, with the new
        // --overwrite-existing flag enabled.
        ImportTimelineData importTimelineData = new ImportTimelineData(new String[] {
                "--meta", metaFile.getAbsolutePath(),
                "--data", dataFile.getAbsolutePath(),
                "--overwrite-existing",
        });
        importTimelineData.run();

        // TCGA-A1-A0SB: the seeded event must be gone, replaced by the two
        // file rows (one SPECIMEN, one STATUS) with the file's values.
        List<ClinicalEvent> sbClinicalEvents = DaoClinicalEvent.getClinicalEvent(sbPatient.getInternalId());
        assertEquals(2, sbClinicalEvents.size());
        ClinicalEvent sbSpecimen = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("SPECIMEN")).findFirst().get();
        assertEquals(20L, sbSpecimen.getStartDate());
        assertEquals(60L, sbSpecimen.getStopDate());
        assertEquals(Map.of(
                "SPECIMEN_SITE", "test_specimen_site_1",
                "SPECIMEN_TYPE", "test_specimen_type",
                "SOURCE", "test_source_3"
        ), sbSpecimen.getEventData());
        ClinicalEvent sbStatus = sbClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get();
        assertEquals(10L, sbStatus.getStartDate());
        assertEquals(20L, sbStatus.getStopDate());
        assertEquals(Map.of("SOURCE", "test_source_4"), sbStatus.getEventData());

        // TCGA-A1-A0SD: imported from the file; STOP_DATE is blank in the data
        // file, so stopDate must be null.
        Patient sdPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "TCGA-A1-A0SD");
        List<ClinicalEvent> sdClinicalEvents = DaoClinicalEvent.getClinicalEvent(sdPatient.getInternalId());
        assertEquals(1, sdClinicalEvents.size());
        ClinicalEvent sdStatus = sdClinicalEvents.stream().filter(ce -> ce.getEventType().equals("STATUS")).findFirst().get();
        assertEquals(45L, sdStatus.getStartDate());
        assertNull(sdStatus.getStopDate());
        assertEquals(Map.of("SOURCE", "test_source_2"), sdStatus.getEventData());

        // The data file's NONEXISTENT_PATIENT row must be skipped, not create
        // a patient record.
        Patient nonexistentPatient = DaoPatient.getPatientByCancerStudyAndPatientId(cancerStudy.getInternalId(), "NONEXISTENT_PATIENT");
        assertNull(nonexistentPatient);
    }

}
5 changes: 5 additions & 0 deletions src/test/resources/incremental/clinical/data_timeline.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE
TCGA-A1-A0SB 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3
TCGA-A1-A0SB 10 20 STATUS test_source_4
TCGA-A1-A0SD 45 STATUS test_source_2
NONEXISTENT_PATIENT 100 200 STATUS test_source_1
4 changes: 4 additions & 0 deletions src/test/resources/incremental/clinical/meta_timeline.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cancer_study_identifier: study_tcga_pub
genetic_alteration_type: CLINICAL
datatype: TIMELINE
data_filename: data_timeline.txt
3 changes: 3 additions & 0 deletions tests/system_tests_import_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,8 @@ def test_incremental_load(self, run_java, locate_jar):
'--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress')
treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing',
'--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress')
timeline_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportTimelineData', '--overwrite-existing',
'--meta', f'{data_directory}/meta_timeline.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_timeline.txt', '--noprogress')
case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds',
'--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists')

Expand All @@ -130,6 +132,7 @@ def test_incremental_load(self, run_java, locate_jar):
expression_median_call,
methylation_hm27_call,
treatment_ic50_call,
timeline_call,
case_list_call,
])

Expand Down
4 changes: 4 additions & 0 deletions tests/test_data/study_es_0_inc/data_timeline.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
PATIENT_ID START_DATE STOP_DATE EVENT_TYPE SPECIMEN_SITE SPECIMEN_TYPE SOURCE
TCGA-BH-A18K 20 60 SPECIMEN test_specimen_site_1 test_specimen_type test_source_3
TCGA-BH-A18K 10 20 STATUS test_source_4
TCGA-BH-NEW 100 200 STATUS test_source_1
4 changes: 4 additions & 0 deletions tests/test_data/study_es_0_inc/meta_timeline.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
cancer_study_identifier: study_es_0
genetic_alteration_type: CLINICAL
datatype: TIMELINE
data_filename: data_timeline.txt
Loading