Skip to content

(6/7) RFC79: Implement incremental upload of structural variants data #40

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,6 +382,7 @@ class MetaFileTypes(object):
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL,
MetaFileTypes.TIMELINE,
MetaFileTypes.GENE_PANEL_MATRIX,
MetaFileTypes.STRUCTURAL_VARIANT,
]

IMPORTER_CLASSNAME_BY_META_TYPE = {
Expand Down
29 changes: 29 additions & 0 deletions src/main/java/org/mskcc/cbio/portal/dao/DaoStructuralVariant.java
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,9 @@
import java.sql.ResultSet;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Set;

public class DaoStructuralVariant {

Expand Down Expand Up @@ -151,6 +153,33 @@ public static void addStructuralVariantToBulkLoader(StructuralVariant structural
}
}

/**
 * Deletes the structural variants of the given samples within one genetic profile.
 * <p>
 * Rows of {@code alteration_driver_annotation} that join to the deleted structural variants on
 * {@code GENETIC_PROFILE_ID} and {@code SAMPLE_ID} are removed by the same statement.
 * <p>
 * When {@code sampleIds} is {@code null} or empty there is nothing to delete and the method
 * returns without touching the database. Without this guard the statement would end in
 * {@code IN ()}, which is not valid SQL and would surface as a {@link DaoException}.
 *
 * @param geneticProfileId internal id of the genetic profile whose structural variants are deleted
 * @param sampleIds        internal ids of the samples whose structural variants are deleted
 * @throws DaoException if the underlying SQL statement fails
 */
public static void deleteStructuralVariants(int geneticProfileId, Set<Integer> sampleIds) throws DaoException {
    if (sampleIds == null || sampleIds.isEmpty()) {
        // No samples means no rows to delete; also avoids generating an empty "IN ()" list.
        return;
    }
    Connection con = null;
    PreparedStatement pstmt = null;
    // A DELETE produces no result set; the variable stays null and is only handed to closeAll.
    ResultSet rs = null;
    try {
        // NOTE(review): the connection is requested/released under DaoGene.class, as before —
        // confirm whether DaoStructuralVariant.class was intended.
        con = JdbcUtil.getDbConnection(DaoGene.class);
        // One "?" placeholder per sample id, so every value is bound rather than concatenated.
        String sampleIdPlaceholders = String.join(",", Collections.nCopies(sampleIds.size(), "?"));
        pstmt = con.prepareStatement("DELETE structural_variant, alteration_driver_annotation" +
            " FROM structural_variant" +
            " LEFT JOIN alteration_driver_annotation" +
            " ON alteration_driver_annotation.GENETIC_PROFILE_ID = structural_variant.GENETIC_PROFILE_ID" +
            " AND alteration_driver_annotation.SAMPLE_ID = structural_variant.SAMPLE_ID" +
            " WHERE structural_variant.GENETIC_PROFILE_ID=? AND structural_variant.SAMPLE_ID IN ("
            + sampleIdPlaceholders
            + ")");
        int parameterIndex = 1;
        pstmt.setInt(parameterIndex++, geneticProfileId);
        for (Integer sampleId : sampleIds) {
            pstmt.setInt(parameterIndex++, sampleId);
        }
        pstmt.executeUpdate();
    } catch (SQLException e) {
        throw new DaoException(e);
    } finally {
        JdbcUtil.closeAll(DaoGene.class, con, pstmt, rs);
    }
}

public static long getLargestInternalId() throws DaoException {
Connection con = null;
PreparedStatement pstmt = null;
Expand Down
54 changes: 54 additions & 0 deletions src/main/java/org/mskcc/cbio/portal/model/StructuralVariant.java
Original file line number Diff line number Diff line change
Expand Up @@ -468,4 +468,58 @@ public String getAnnotationJson() {
public void setAnnotationJson(String annotationJson) {
this.annotationJson = annotationJson;
}

/**
 * Renders every field of this structural variant as
 * {@code StructuralVariant{name=value, ...}}; the fields listed with single quotes below
 * are wrapped in single quotes in the output.
 */
@Override
public String toString() {
    java.util.StringJoiner fields = new java.util.StringJoiner(", ", "StructuralVariant{", "}");

    // Identifiers
    fields.add("internalId=" + internalId);
    fields.add("geneticProfileId=" + geneticProfileId);
    fields.add("structuralVariantId=" + structuralVariantId);
    fields.add("sampleIdInternal=" + sampleIdInternal);
    fields.add("sampleId='" + sampleId + "'");

    // Site 1
    fields.add("site1EntrezGeneId=" + site1EntrezGeneId);
    fields.add("site1HugoSymbol='" + site1HugoSymbol + "'");
    fields.add("site1EnsemblTranscriptId='" + site1EnsemblTranscriptId + "'");
    fields.add("site1Chromosome='" + site1Chromosome + "'");
    fields.add("site1Position=" + site1Position);
    fields.add("site1Contig='" + site1Contig + "'");
    fields.add("site1Region='" + site1Region + "'");
    fields.add("site1RegionNumber=" + site1RegionNumber);
    fields.add("site1Description='" + site1Description + "'");

    // Site 2
    fields.add("site2EntrezGeneId=" + site2EntrezGeneId);
    fields.add("site2HugoSymbol='" + site2HugoSymbol + "'");
    fields.add("site2EnsemblTranscriptId='" + site2EnsemblTranscriptId + "'");
    fields.add("site2Chromosome='" + site2Chromosome + "'");
    fields.add("site2Position=" + site2Position);
    fields.add("site2Contig='" + site2Contig + "'");
    fields.add("site2Region='" + site2Region + "'");
    fields.add("site2RegionNumber=" + site2RegionNumber);
    fields.add("site2Description='" + site2Description + "'");
    fields.add("site2EffectOnFrame='" + site2EffectOnFrame + "'");

    // Build and support
    fields.add("ncbiBuild='" + ncbiBuild + "'");
    fields.add("dnaSupport='" + dnaSupport + "'");
    fields.add("rnaSupport='" + rnaSupport + "'");

    // Read counts
    fields.add("normalReadCount=" + normalReadCount);
    fields.add("tumorReadCount=" + tumorReadCount);
    fields.add("normalVariantCount=" + normalVariantCount);
    fields.add("tumorVariantCount=" + tumorVariantCount);
    fields.add("normalPairedEndReadCount=" + normalPairedEndReadCount);
    fields.add("tumorPairedEndReadCount=" + tumorPairedEndReadCount);
    fields.add("normalSplitReadCount=" + normalSplitReadCount);
    fields.add("tumorSplitReadCount=" + tumorSplitReadCount);

    // Event description
    fields.add("annotation='" + annotation + "'");
    fields.add("breakpointType='" + breakpointType + "'");
    fields.add("connectionType='" + connectionType + "'");
    fields.add("eventInfo='" + eventInfo + "'");
    fields.add("variantClass='" + variantClass + "'");
    fields.add("length=" + length);
    fields.add("comments='" + comments + "'");
    fields.add("svStatus='" + svStatus + "'");

    // Driver annotations
    fields.add("driverFilter='" + driverFilter + "'");
    fields.add("driverFilterAnn='" + driverFilterAnn + "'");
    fields.add("driverTiersFilter='" + driverTiersFilter + "'");
    fields.add("driverTiersFilterAnn='" + driverTiersFilterAnn + "'");
    fields.add("annotationJson='" + annotationJson + "'");

    return fields.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,8 @@ public void run() {
dataFile,
geneticProfile.getGeneticProfileId(),
genePanel,
namespaces
namespaces,
overwriteExisting
);
importer.importData();
} else if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,22 +46,25 @@
public class ImportStructuralVariantData {

// Initialize variables
private File structuralVariantFile;
private int geneticProfileId;
private String genePanel;
private Set<String> namespaces;
private Set<String> sampleSet = new HashSet<>();
private final File structuralVariantFile;
private final int geneticProfileId;
private final Integer genePanelId;
private final Set<String> namespaces;

private final boolean updateMode;

public ImportStructuralVariantData(
File structuralVariantFile,
int geneticProfileId,
String genePanel,
Set<String> namespaces
Set<String> namespaces,
boolean updateMode
) throws DaoException {
this.structuralVariantFile = structuralVariantFile;
this.geneticProfileId = geneticProfileId;
this.genePanel = genePanel;
this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
this.namespaces = namespaces;
this.updateMode = updateMode;
}

public void importData() throws IOException, DaoException {
Expand All @@ -75,7 +78,7 @@ public void importData() throws IOException, DaoException {
int recordCount = 0;
// Genetic profile is read in first
GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);
ArrayList <Integer> orderedSampleList = new ArrayList<Integer>();
Set<Integer> sampleIds = new HashSet<>();
long id = DaoStructuralVariant.getLargestInternalId();
Set<String> uniqueSVs = new HashSet<>();
while ((line = buf.readLine()) != null) {
Expand Down Expand Up @@ -175,27 +178,34 @@ public void importData() throws IOException, DaoException {
// Add structural variant
DaoStructuralVariant.addStructuralVariantToBulkLoader(structuralVariant);

// Add sample to sample profile list, which is important for gene panels
if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId) && !sampleSet.contains(sample.getStableId())) {
if (genePanel != null) {
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, GeneticProfileUtil.getGenePanelId(genePanel));
} else {
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, null);
}
}
sampleSet.add(sample.getStableId());
orderedSampleList.add(sample.getInternalId());
sampleIds.add(sample.getInternalId());
}
}
}
}
DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList);
// TODO the dao methods could receive a set of sample ids (like the deletion does) instead of looping
if (updateMode) {
for (Integer sampleId : sampleIds) {
DaoSampleProfile.updateSampleProfile(sampleId, geneticProfileId, genePanelId);
}
DaoStructuralVariant.deleteStructuralVariants(geneticProfileId, sampleIds);
} else {
for (Integer sampleId : sampleIds) {
createSampleProfileIfNotExists(sampleId);
}
}

buf.close();
MySQLbulkLoader.flushAll();
}

private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, DaoGeneOptimized daoGene) {
/**
 * Registers the sample in this importer's genetic profile (with the importer's gene panel id)
 * unless the sample is already registered there.
 *
 * @param internalSampleId internal id of the sample to register
 * @throws DaoException if one of the DAO calls fails
 */
private void createSampleProfileIfNotExists(int internalSampleId) throws DaoException {
    boolean alreadyInProfile = DaoSampleProfile.sampleExistsInGeneticProfile(internalSampleId, geneticProfileId);
    if (alreadyInProfile) {
        return;
    }
    DaoSampleProfile.addSampleProfile(internalSampleId, geneticProfileId, genePanelId);
}

private CanonicalGene setCanonicalGene(long siteEntrezGeneId, String siteHugoSymbol, DaoGeneOptimized daoGene) {
CanonicalGene siteCanonicalGene = null;

// If the Entrez Gene Id is not "NA" set the canonical gene.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
/*
* This file is part of cBioPortal.
*
* cBioPortal is free software: you can redistribute it and/or modify
* it under the terms of the GNU Affero General Public License as
* published by the Free Software Foundation, either version 3 of the
* License.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Affero General Public License for more details.
*
* You should have received a copy of the GNU Affero General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
*/

package org.mskcc.cbio.portal.integrationTest.incremental;

import org.junit.Before;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.mskcc.cbio.portal.dao.DaoCancerStudy;
import org.mskcc.cbio.portal.dao.DaoException;
import org.mskcc.cbio.portal.dao.DaoGenePanel;
import org.mskcc.cbio.portal.dao.DaoGeneticProfile;
import org.mskcc.cbio.portal.dao.DaoSample;
import org.mskcc.cbio.portal.dao.DaoSampleProfile;
import org.mskcc.cbio.portal.dao.DaoStructuralVariant;
import org.mskcc.cbio.portal.dao.MySQLbulkLoader;
import org.mskcc.cbio.portal.model.CancerStudy;
import org.mskcc.cbio.portal.model.ExtendedMutation;
import org.mskcc.cbio.portal.model.GenePanel;
import org.mskcc.cbio.portal.model.GeneticProfile;
import org.mskcc.cbio.portal.model.Sample;
import org.mskcc.cbio.portal.model.StructuralVariant;
import org.mskcc.cbio.portal.scripts.ImportProfileData;
import org.springframework.test.annotation.Rollback;
import org.springframework.test.context.ContextConfiguration;
import org.springframework.test.context.junit4.SpringJUnit4ClassRunner;
import org.springframework.transaction.annotation.Transactional;

import java.io.File;
import java.util.ArrayList;
import java.util.List;
import java.util.Optional;
import java.util.Set;
import java.util.stream.Collectors;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations;

/**
* Tests Incremental Import of Structural Variants Data.
*
* @author Ruslan Forostianov
* @author Pieter Lukasse
*/
@RunWith(SpringJUnit4ClassRunner.class)
@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" })
@Rollback
@Transactional
public class TestIncrementalStructuralVariantsImport {

    public static final String STUDY_ID = "study_tcga_pub";
    private CancerStudy cancerStudy;

    /** Looks up the study that the test runs against. */
    @Before
    public void setUp() throws DaoException {
        cancerStudy = DaoCancerStudy.getCancerStudyByStableId(STUDY_ID);
    }

    /**
     * Seeds one structural variant for a sample, runs the importer with
     * {@code --overwrite-existing} on the incremental fixture, and checks that the
     * three variants from the fixture are what is stored and that the sample profile
     * points to the TSTGNPNLSV gene panel.
     */
    @Test
    public void testIncrementalUpload() throws DaoException {
        GeneticProfile profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_structural_variants");
        assertNotNull(profile);
        // According to the original fixture note, this sample has no SV data attached beforehand.
        Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudy.getInternalId(), "TCGA-A1-A0SE-01");

        seedVariantToBeOverwritten(sample, profile);

        File fixtureFolder = new File("src/test/resources/incremental/structural_variants/");
        String metaPath = new File(fixtureFolder, "meta_structural_variants.txt").getAbsolutePath();
        String dataPath = new File(fixtureFolder, "data_structural_variants.txt").getAbsolutePath();
        String[] importerArgs = {
            "--loadMode", "bulkLoad",
            "--meta", metaPath,
            "--data", dataPath,
            "--overwrite-existing",
        };
        new ImportProfileData(importerArgs).run();

        List<StructuralVariant> storedVariants = DaoStructuralVariant.getAllStructuralVariants();
        assertEquals(3, storedVariants.size());
        for (String expectedDescription : Set.of("site1_test_desc_1", "site1_test_desc_2", "site1_test_desc_3")) {
            Optional<StructuralVariant> match = findVariant(storedVariants, expectedDescription, sample, profile);
            assertTrue(match.isPresent());
            assertNotNull(match.get().getDriverFilter());
        }

        GenePanel expectedPanel = DaoGenePanel.getGenePanelByStableId("TSTGNPNLSV");
        assertEquals("Sample profile has to point to TSTGNPNLSV panel",
            expectedPanel.getInternalId(),
            DaoSampleProfile.getPanelId(sample.getInternalId(), profile.getGeneticProfileId()));
    }

    /**
     * Stores a structural variant and a sample-profile row (without gene panel) for the
     * sample, then flushes the bulk loader, so the import has existing data to overwrite.
     */
    private void seedVariantToBeOverwritten(Sample sample, GeneticProfile profile) throws DaoException {
        StructuralVariant staleVariant = new StructuralVariant();
        staleVariant.setSampleIdInternal(sample.getInternalId());
        staleVariant.setGeneticProfileId(profile.getGeneticProfileId());
        staleVariant.setAnnotation("TESTANNOT");
        staleVariant.setDriverFilter("DRVFILTER");
        staleVariant.setSite1RegionNumber(1);
        staleVariant.setSite2RegionNumber(2);
        staleVariant.setComments("This record has to be overwritten");
        DaoStructuralVariant.addStructuralVariantToBulkLoader(staleVariant);
        DaoSampleProfile.addSampleProfile(sample.getInternalId(), profile.getGeneticProfileId(), null);
        MySQLbulkLoader.flushAll();
    }

    /**
     * Returns the first variant with the given site 1 description that belongs to the
     * given sample and genetic profile.
     */
    private static Optional<StructuralVariant> findVariant(
        List<StructuralVariant> variants,
        String site1Description,
        Sample sample,
        GeneticProfile profile
    ) {
        return variants.stream()
            .filter(variant -> site1Description.equals(variant.getSite1Description())
                && variant.getSampleIdInternal() == sample.getInternalId()
                && variant.getGeneticProfileId() == profile.getGeneticProfileId())
            .findFirst();
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ public void testImportStructuralVariantData() throws DaoException, IOException {

// Load test structural variants
File file = new File("src/test/resources/data_structural_variants.txt");
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces);
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, noNamespaces, false);
importer.importData();
MySQLbulkLoader.flushAll();

Expand Down Expand Up @@ -133,7 +133,7 @@ public void testImportStructuralVariantDataImportsCustomNamespacesFromTwoSamples
// Load test structural variants
File file = new File("src/test/resources/data_structural_variants.txt");
Set<String> namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2");
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport);
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false);
importer.importData();
MySQLbulkLoader.flushAll();

Expand All @@ -159,7 +159,7 @@ public void testImportStructuralVariantDataIgnoresUnspecifiedNamespaces() throws
// Load test structural variants
File file = new File("src/test/resources/data_structural_variants_with_unspecified_namespace.txt");
Set<String> namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2");
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport);
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false);
importer.importData();
MySQLbulkLoader.flushAll();

Expand All @@ -182,7 +182,7 @@ public void testImportStructuralVariantDataWithNoNamespaceData() throws DaoExcep
// Load test structural variants
File file = new File("src/test/resources/data_structural_variants_with_no_namespace_data.txt");
Set<String> namespacesToImport = newHashSet("StructVarNamespace", "StructVarNamespace2");
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport);
ImportStructuralVariantData importer = new ImportStructuralVariantData(file, geneticProfileId, null, namespacesToImport, false);
importer.importData();
MySQLbulkLoader.flushAll();

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
Sample_Id Site1_Entrez_Gene_Id Site1_Hugo_Symbol Site1_Ensembl_Transcript_Id Site1_Region_Number Site1_Chromosome Site1_Position Site1_Region Site1_Description Site2_Entrez_Gene_Id Site2_Hugo_Symbol Site2_Ensembl_Transcript_Id Site2_Region_Number Site2_Chromosome Site2_Position Site2_Contig Site2_Region Site2_Description Site2_Effect_On_Frame NCBI_Build DNA_Support RNA_Support Normal_Read_Count Tumor_Read_Count Normal_Variant_Count Tumor_Variant_Count Normal_Paired_End_Read_Count Tumor_Paired_End_Read_Count Normal_Split_Read_Count Tumor_Split_Read_Count Annotation Breakpoint_Type Center Connection_Type Event_Info Class SV_Length Comments External_Annotation cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation SV_Status StructVarNs.column1 StructVarNs2.lorem StructVarNs.column2
TCGA-A1-A0SE-01 NA AKT1 ENST00000242365 15 7 138536968 EXON site1_test_desc_1 NA BRCA1 ENST00000288602 10 7 140482957 EXON PIEZO1-NCOA4.PIEZO1.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA PIEZO1-NCOA4.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Foo Class 4 Class annotation SOMATIC value1 ipsum value2
TCGA-A1-A0SE-01 NA AKT2 ENST00000242365 15 7 138536968 EXON site1_test_desc_2 NA BRAF ENST00000288602 10 7 140482957 EXON KIAA1549-BRAF.K16B10.COSF509_2 NA GRCh37 no yes NA 1000 NA 900 NA NA NA NA KIAA1549-BRAF.K16B10.COSF509 NA NA NA Fusion NA NA Gain-of-Function COSMIC:COSF509 Putative_Driver Test driver Class 4 Class annotation SOMATIC value1 ipsum value2
TCGA-A1-A0SE-01 NA AKT3 ENST00000344348 7 10 51582939 EXON site1_test_desc_3 NA BRCA2 ENST00000340058 12 10 43612031 EXON NCOA4-RET.N7R12_2 NA GRCh37 no yes NA 1001 NA 800 NA NA NA NA NCOA4-RET.N7R1 NA NA NA Fusion NA NA Gain-of-Function NA Putative_Passenger Test driver Class 3 Class annotation SOMATIC NA NA NA
Loading
Loading