Skip to content

Commit

Permalink
Implement incremental upload for gene panel matrix
Browse files Browse the repository at this point in the history
The uploader was already working in an incremental manner,
so I only had to add tests for it.
I did, however, have to implement incremental upload of the gene panel matrix
from the different data uploaders (CNA, Mutations).
  • Loading branch information
forus committed May 28, 2024
1 parent cc80e56 commit 4070e68
Show file tree
Hide file tree
Showing 25 changed files with 319 additions and 58 deletions.
1 change: 1 addition & 0 deletions scripts/importer/cbioportal_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -381,6 +381,7 @@ class MetaFileTypes(object):
MetaFileTypes.GENERIC_ASSAY_BINARY,
MetaFileTypes.GENERIC_ASSAY_CATEGORICAL,
MetaFileTypes.TIMELINE,
MetaFileTypes.GENE_PANEL_MATRIX,
]

IMPORTER_CLASSNAME_BY_META_TYPE = {
Expand Down
63 changes: 42 additions & 21 deletions src/main/java/org/mskcc/cbio/portal/dao/DaoSampleProfile.java
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ private DaoSampleProfile() {}
private static final int NO_SUCH_PROFILE_ID = -1;
private static final String TABLE_NAME = "sample_profile";

public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException {
public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, Integer panelId) throws DaoException {
if (MySQLbulkLoader.isBulkLoad()) {

// Add new record using bulk loader. Order of fields is:
Expand Down Expand Up @@ -80,27 +80,19 @@ public static int addSampleProfile(Integer sampleId, Integer geneticProfileId, I
ResultSet rs = null;

try {
if (!sampleExistsInGeneticProfile(sampleId, geneticProfileId)) {
con = JdbcUtil.getDbConnection(DaoSampleProfile.class);
pstmt = con.prepareStatement
("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) "
+ "VALUES (?,?,?)");
pstmt.setInt(1, sampleId);
pstmt.setInt(2, geneticProfileId);
if (panelId != null) {
pstmt.setInt(3, panelId);
}
else {
pstmt.setNull(3, java.sql.Types.INTEGER);
}
return pstmt.executeUpdate();
} else {
// This should be an error, because the record already exists.
return 0;
con = JdbcUtil.getDbConnection(DaoSampleProfile.class);
pstmt = con.prepareStatement
("INSERT INTO sample_profile (`SAMPLE_ID`, `GENETIC_PROFILE_ID`, `PANEL_ID`) VALUES (?,?,?)");
pstmt.setInt(1, sampleId);
pstmt.setInt(2, geneticProfileId);
if (panelId != null) {
pstmt.setInt(3, panelId);
}
} catch (NullPointerException e) {
throw new DaoException(e);
} catch (SQLException e) {
else {
pstmt.setNull(3, java.sql.Types.INTEGER);
}
return pstmt.executeUpdate();
} catch (NullPointerException | SQLException e) {
throw new DaoException(e);
} finally {
JdbcUtil.closeAll(DaoSampleProfile.class, con, pstmt, rs);
Expand Down Expand Up @@ -174,6 +166,35 @@ public static boolean sampleExistsInGeneticProfile(int sampleId, int geneticProf
}
}

/**
 * Reads the PANEL_ID column of the sample_profile row identified by the given
 * sample and genetic profile.
 *
 * @param sampleId value matched against the SAMPLE_ID column
 * @param geneticProfileId value matched against the GENETIC_PROFILE_ID column
 * @return the stored panel id, or {@code null} when the column holds SQL NULL
 * @throws DaoException wrapping a {@link NoSuchElementException} when no row matches,
 *         or wrapping the {@link SQLException} raised while querying
 */
public static Integer getPanelId(int sampleId, int geneticProfileId) throws DaoException {
    Connection connection = null;
    PreparedStatement statement = null;
    ResultSet resultSet = null;

    try {
        connection = JdbcUtil.getDbConnection(DaoSampleProfile.class);
        statement = connection.prepareStatement(
            "SELECT PANEL_ID FROM sample_profile WHERE SAMPLE_ID = ? AND GENETIC_PROFILE_ID = ?");
        statement.setInt(1, sampleId);
        statement.setInt(2, geneticProfileId);
        resultSet = statement.executeQuery();

        // Guard clause: a missing row is reported as an error, not as a null panel.
        if (!resultSet.next()) {
            throw new NoSuchElementException("No sample_profile with SAMPLE_ID=" + sampleId + " and GENETIC_PROFILE_ID=" + geneticProfileId);
        }

        // getInt() yields 0 for SQL NULL, so wasNull() must be consulted right after the read.
        final int storedPanelId = resultSet.getInt(1);
        return resultSet.wasNull() ? null : Integer.valueOf(storedPanelId);
    } catch (NoSuchElementException | SQLException e) {
        throw new DaoException(e);
    } finally {
        JdbcUtil.closeAll(DaoSampleProfile.class, connection, statement, resultSet);
    }
}

public static int countSamplesInProfile(int geneticProfileId) throws DaoException {
Connection con = null;
PreparedStatement pstmt = null;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,6 @@ public class ImportCnaDiscreteLongData {
private final File cnaFile;
private final int geneticProfileId;
private GeneticAlterationImporter geneticAlterationGeneImporter;
private String genePanel;
private final DaoGeneOptimized daoGene;
private CnaUtil cnaUtil;
private Set<CnaEvent.Event> existingCnaEvents = new HashSet<>();
Expand All @@ -78,6 +77,7 @@ public class ImportCnaDiscreteLongData {

private final ArrayList<SampleIdGeneticProfileId> sampleIdGeneticProfileIds = new ArrayList<>();
private ArrayList<Integer> orderedSampleList;
private final Integer genePanelId;

public ImportCnaDiscreteLongData(
File cnaFile,
Expand All @@ -97,7 +97,7 @@ public ImportCnaDiscreteLongData(
+ " has not supported datatype: "
+ geneticProfile.getDatatype());
}
this.genePanel = genePanel;
this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
this.daoGene = daoGene;
this.updateMode = updateMode;
}
Expand Down Expand Up @@ -206,7 +206,7 @@ public void extractDataToImport(
}
throw new RuntimeException("Sample with stable id " + sampleIdStr + " is not found in the database.");
}
createSampleProfile(sample);
ensureSampleProfileExists(sample);

long entrezId = gene.getEntrezGeneId();
int sampleId = sample.getInternalId();
Expand All @@ -223,6 +223,18 @@ public void extractDataToImport(

}

/**
 * Makes sure a sample_profile link is handled for the given sample before its
 * CNA data is processed.
 * When {@code updateMode} is off, the link is only created if it is missing;
 * when it is on, the call is routed to {@code upsertSampleProfile}.
 *
 * @param sample sample whose profile link is being ensured
 * @throws DaoException propagated from the delegated DAO call
 */
private void ensureSampleProfileExists(Sample sample) throws DaoException {
    if (!updateMode) {
        createSampleProfileIfNotExists(sample);
        return;
    }
    upsertSampleProfile(sample);
}

/**
 * Forwards the sample's internal id together with this importer's genetic
 * profile id and gene panel id to {@code DaoSampleProfile.updateSampleProfile}.
 * NOTE(review): presumably that call inserts or replaces the sample_profile
 * row; its implementation is not visible here, so confirm.
 *
 * @param sample sample whose profile link is written
 * @throws DaoException propagated from the DAO call
 */
private void upsertSampleProfile(Sample sample) throws DaoException {
    DaoSampleProfile.updateSampleProfile(
        sample.getInternalId(),
        geneticProfileId,
        genePanelId);
}

/**
* Store all cna events related to a single gene
*/
Expand Down Expand Up @@ -338,15 +350,14 @@ private CanonicalGene getGene(
*
* @return boolean created or not
*/
public boolean createSampleProfile(
public boolean createSampleProfileIfNotExists(
Sample sample
) throws Exception {
) throws DaoException {
boolean inDatabase = DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId);
Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
SampleIdGeneticProfileId toCreate = new SampleIdGeneticProfileId(sample.getInternalId(), geneticProfileId);
boolean isQueued = this.sampleIdGeneticProfileIds.contains(toCreate);
if (!inDatabase && !isQueued) {
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID);
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId);
this.sampleIdGeneticProfileIds.add(toCreate);
return true;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,20 +34,48 @@

import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.ObjectMapper;
import org.mskcc.cbio.portal.dao.*;
import org.mskcc.cbio.portal.model.*;
import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent;
import org.mskcc.cbio.portal.util.*;
import org.mskcc.cbio.maf.*;

import org.apache.commons.lang3.StringUtils;
import org.mskcc.cbio.maf.MafRecord;
import org.mskcc.cbio.maf.MafUtil;
import org.mskcc.cbio.portal.dao.DaoAlleleSpecificCopyNumber;
import org.mskcc.cbio.portal.dao.DaoCancerStudy;
import org.mskcc.cbio.portal.dao.DaoException;
import org.mskcc.cbio.portal.dao.DaoGeneOptimized;
import org.mskcc.cbio.portal.dao.DaoGeneticProfile;
import org.mskcc.cbio.portal.dao.DaoMutation;
import org.mskcc.cbio.portal.dao.DaoReferenceGenome;
import org.mskcc.cbio.portal.dao.DaoSample;
import org.mskcc.cbio.portal.dao.DaoSampleProfile;
import org.mskcc.cbio.portal.dao.MySQLbulkLoader;
import org.mskcc.cbio.portal.model.AlleleSpecificCopyNumber;
import org.mskcc.cbio.portal.model.CancerStudy;
import org.mskcc.cbio.portal.model.CanonicalGene;
import org.mskcc.cbio.portal.model.ExtendedMutation;
import org.mskcc.cbio.portal.model.ExtendedMutation.MutationEvent;
import org.mskcc.cbio.portal.model.GeneticAlterationType;
import org.mskcc.cbio.portal.model.GeneticProfile;
import org.mskcc.cbio.portal.model.Sample;
import org.mskcc.cbio.portal.util.ConsoleUtil;
import org.mskcc.cbio.portal.util.ExtendedMutationUtil;
import org.mskcc.cbio.portal.util.GeneticProfileUtil;
import org.mskcc.cbio.portal.util.GlobalProperties;
import org.mskcc.cbio.portal.util.ProgressMonitor;
import org.mskcc.cbio.portal.util.StableIdUtil;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.*;
import java.util.regex.*;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Import an extended mutation file.
Expand All @@ -59,7 +87,7 @@
* <br>
* @author Selcuk Onur Sumer
*/
public class ImportExtendedMutationData{
public class ImportExtendedMutationData {

private File mutationFile;
private int geneticProfileId;
Expand All @@ -69,12 +97,13 @@ public class ImportExtendedMutationData{
private int samplesSkipped = 0;
private Set<String> sampleSet = new HashSet<String>();
private Set<String> geneSet = new HashSet<String>();
private String genePanel;
private Set<String> filteredMutations = new HashSet<String>();
private Set<String> namespaces = new HashSet<String>();
private Pattern SEQUENCE_SAMPLES_REGEX = Pattern.compile("^.*sequenced_samples:(.*)$");
private final String ASCN_NAMESPACE = "ASCN";

private final Integer genePanelId;

private final boolean overwriteExisting;

/**
Expand All @@ -89,7 +118,7 @@ public ImportExtendedMutationData(File mutationFile, int geneticProfileId, Strin
this.mutationFile = mutationFile;
this.geneticProfileId = geneticProfileId;
this.swissprotIsAccession = false;
this.genePanel = genePanel;
this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);;
this.filteredMutations = filteredMutations;

// create default MutationFilter
Expand Down Expand Up @@ -428,7 +457,7 @@ public void importData() throws IOException, DaoException {
mutations.put(mutation,mutation);
}
if(!sampleSet.contains(sample.getStableId())) {
addSampleProfileRecord(sample);
ensureSampleProfileExists(sample);
}
// update ascn object with mutation unique key details
if (ascn != null){
Expand Down Expand Up @@ -600,17 +629,28 @@ private List<Sample> getSequencedSamples(String sequencedSamplesIDList, GeneticP

private void addSampleProfileRecords(List<Sample> sequencedSamples) throws DaoException {
for (Sample sample : sequencedSamples) {
addSampleProfileRecord(sample);
ensureSampleProfileExists(sample);
}
if( MySQLbulkLoader.isBulkLoad()) {
MySQLbulkLoader.flushAll();
}
}

private void addSampleProfileRecord(Sample sample) throws DaoException {
/**
 * Makes sure a sample_profile link is handled for the given sample during the
 * mutation import.
 * When {@code overwriteExisting} is off, the link is only created if it is
 * missing; when it is on, the call is routed to {@code upsertSampleProfile}.
 *
 * @param sample sample whose profile link is being ensured
 * @throws DaoException propagated from the delegated DAO call
 */
private void ensureSampleProfileExists(Sample sample) throws DaoException {
    if (!overwriteExisting) {
        createSampleProfileIfNotExists(sample);
        return;
    }
    upsertSampleProfile(sample);
}

/**
 * Forwards the sample's internal id together with this importer's genetic
 * profile id and gene panel id to {@code DaoSampleProfile.updateSampleProfile}.
 * NOTE(review): presumably that call inserts or replaces the sample_profile
 * row; its implementation is not visible here, so confirm.
 *
 * @param sample sample whose profile link is written
 * @throws DaoException propagated from the DAO call
 */
private void upsertSampleProfile(Sample sample) throws DaoException {
    DaoSampleProfile.updateSampleProfile(
        sample.getInternalId(),
        geneticProfileId,
        genePanelId);
}

private void createSampleProfileIfNotExists(Sample sample) throws DaoException {
if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) {
Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID);
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,10 @@ public void run() {
"gene panel file" ).withRequiredArg().describedAs( "meta_file.txt" ).ofType( String.class );
parser.accepts("noprogress", "this option can be given to avoid the messages regarding memory usage and % complete");

// Incremental upload is already supported by this uploader. The flag is accepted for uniformity, so that upstream software passing it does not cause an error.
parser.accepts("overwrite-existing",
"Enables re-uploading molecular data that already exist for the given profile and sample.")
.withOptionalArg().describedAs("overwrite-existing").ofType(String.class);
OptionSet options;
try {
options = parser.parse( args );
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,6 @@ public class ImportTabDelimData {
private int entriesSkipped = 0;
private int nrExtraRecords = 0;
private Set<String> arrayIdSet = new HashSet<String>();
private String genePanel;
private String genericEntityProperties;
private File pdAnnotationsFile;
private Map<Map.Entry<Integer, Long>, Map<String, String>> pdAnnotations;
Expand All @@ -104,6 +103,7 @@ public class ImportTabDelimData {

private boolean updateMode;
private ArrayList<Integer> orderedSampleList;
private final Integer genePanelId;

/**
* Constructor.
Expand Down Expand Up @@ -170,7 +170,7 @@ public ImportTabDelimData(
) {
this.dataFile = dataFile;
this.geneticProfileId = geneticProfileId;
this.genePanel = genePanel;
this.genePanelId = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
this.updateMode = updateMode;
this.daoGene = daoGene;
this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId);
Expand Down Expand Up @@ -280,7 +280,7 @@ private void doImportData() throws IOException, DaoException {
throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath());
}
}
ensureSampleGeneticProfile(sample);
ensureSampleProfileExists(sample);
orderedSampleList.add(sample.getInternalId());
if (pdAnnotationsForStableSampleIds != null) {
Set<Map.Entry<String, Long>> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet());
Expand Down Expand Up @@ -417,13 +417,21 @@ private void doImportData() throws IOException, DaoException {
}
}

private void ensureSampleGeneticProfile(Sample sample) throws DaoException {
/**
 * Makes sure a sample_profile link is handled for the given sample before its
 * column of the tab-delimited file is imported.
 * When {@code updateMode} is off, the link is only created if it is missing;
 * when it is on, the call is routed to {@code upsertSampleProfile}.
 *
 * @param sample sample whose profile link is being ensured
 * @throws DaoException propagated from the delegated DAO call
 */
private void ensureSampleProfileExists(Sample sample) throws DaoException {
    if (!updateMode) {
        createSampleProfileIfNotExists(sample);
        return;
    }
    upsertSampleProfile(sample);
}

/**
 * Forwards the sample's internal id together with this importer's genetic
 * profile id and gene panel id to {@code DaoSampleProfile.updateSampleProfile}.
 * NOTE(review): presumably that call inserts or replaces the sample_profile
 * row; its implementation is not visible here, so confirm.
 *
 * @param sample sample whose profile link is written
 * @throws DaoException propagated from the DAO call
 */
private void upsertSampleProfile(Sample sample) throws DaoException {
    DaoSampleProfile.updateSampleProfile(
        sample.getInternalId(),
        geneticProfileId,
        genePanelId);
}

private void createSampleProfileIfNotExists(Sample sample) throws DaoException {
if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) {
Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel);
if (updateMode) {
DaoSampleProfile.deleteRecords(List.of(sample.getInternalId()), List.of(geneticProfileId));
}
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID);
DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelId);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,7 @@ public static boolean outlierExpressionSelected(HashSet<String> geneticProfileId
public static int getGenePanelId(String panelId) {
GenePanel genePanel = DaoGenePanel.getGenePanelByStableId(panelId);
if (genePanel == null) {
throw new NoSuchElementException("No gene panel with id " + genePanel);
throw new NoSuchElementException("Gene panel with id " + panelId + " not found.");
}
return genePanel.getInternalId();
}
Expand Down
Loading

0 comments on commit 4070e68

Please sign in to comment.