From b2c1c21665496f77fe91addb6521e8eb674438e2 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 2 May 2024 12:02:13 +0200 Subject: [PATCH 01/27] Refactor tab delim. data importer - Calculate number of lines in the file in the loader - Remove unused imports and fields - Reuse constructors - Reuse common parsing logic in tab delimiter importer - Show full stacktrace which helps in finding where tests errored out --- pom.xml | 3 + .../cbio/portal/dao/DaoGeneticAlteration.java | 4 +- .../portal/scripts/ImportProfileData.java | 19 +- .../portal/scripts/ImportResourceData.java | 1 - .../portal/scripts/ImportTabDelimData.java | 594 ++++++++---------- .../org/mskcc/cbio/portal/util/FileUtil.java | 60 +- .../scripts/TestImportTabDelimData.java | 29 +- 7 files changed, 299 insertions(+), 411 deletions(-) diff --git a/pom.xml b/pom.xml index c71f78a2..e858e319 100644 --- a/pom.xml +++ b/pom.xml @@ -252,6 +252,9 @@ org.apache.maven.plugins maven-surefire-plugin 2.21.0 + + false + default-test diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 25bef125..25eca11c 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -65,12 +65,10 @@ private DaoGeneticAlteration() { * Gets Instance of Dao Object. (Singleton pattern). * * @return DaoGeneticAlteration Object. - * @throws DaoException Dao Initialization Error. 
*/ - public static DaoGeneticAlteration getInstance() throws DaoException { + public static DaoGeneticAlteration getInstance() { if (daoGeneticAlteration == null) { daoGeneticAlteration = new DaoGeneticAlteration(); - } return daoGeneticAlteration; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index a0ffe297..e4b11844 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -54,12 +54,8 @@ public class ImportProfileData extends ConsoleRunnable { public void run() { DaoGeneOptimized daoGene; DaoGeneticAlteration daoGeneticAlteration; - try { - daoGene = DaoGeneOptimized.getInstance(); - daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - } catch (DaoException e) { - throw new RuntimeException("Could not create dao instances", e); - } + daoGene = DaoGeneOptimized.getInstance(); + daoGeneticAlteration = DaoGeneticAlteration.getInstance(); try { // Parse arguments @@ -92,8 +88,7 @@ public void run() { " --> profile id: " + geneticProfile.getGeneticProfileId() + "\n --> profile name: " + geneticProfile.getProfileName() + "\n --> genetic alteration type: " + geneticProfile.getGeneticAlterationType().name()); - ProgressMonitor.setMaxValue(numLines); - + // Check genetic alteration type if (geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_EXTENDED || geneticProfile.getGeneticAlterationType() == GeneticAlterationType.MUTATION_UNCALLED) { @@ -132,9 +127,9 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), - daoGeneticAlteration + daoGeneticAlteration, daoGene ); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } } else if( geneticProfile.getGeneticAlterationType() == 
GeneticAlterationType.COPY_NUMBER_ALTERATION @@ -156,13 +151,13 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, - daoGeneticAlteration + daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); if (pdAnnotationsFilename != null && !"".equals(pdAnnotationsFilename)) { importer.setPdAnnotationsFile(new File(dataFile.getParent(), pdAnnotationsFilename)); } - importer.importData(numLines); + importer.importData(); } } catch (Exception e) { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java index d04124ba..147d59d9 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportResourceData.java @@ -7,7 +7,6 @@ import java.io.*; import joptsimple.*; import java.util.*; -import java.util.regex.*; import java.util.stream.Collectors; import org.apache.commons.collections4.map.MultiKeyMap; diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index b984abf4..81300e63 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -53,11 +53,8 @@ */ public class ImportTabDelimData { public static final String CNA_VALUE_AMPLIFICATION = "2"; - public static final String CNA_VALUE_GAIN = "1"; - public static final String CNA_VALUE_HEMIZYGOUS_DELETION = "-1"; public static final String CNA_VALUE_HOMOZYGOUS_DELETION = "-2"; public static final String CNA_VALUE_PARTIAL_DELETION = "-1.5"; - public static final String CNA_VALUE_ZERO = "0"; private HashSet importedGeneticEntitySet = new HashSet<>(); private File dataFile; private String targetLine; @@ -72,6 +69,11 @@ public class ImportTabDelimData { 
private Map, Map> pdAnnotations; private final GeneticAlterationImporter geneticAlterationImporter; + private int numLines; + private DaoGeneticAlteration daoGeneticAlteration; + + private DaoGeneOptimized daoGene; + /** * Constructor. * @@ -90,17 +92,11 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; - this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.genericEntityProperties = genericEntityProperties; - this.geneticAlterationImporter = new GeneticAlterationImporter( - geneticProfileId, - daoGeneticAlteration - ); + this(dataFile, targetLine, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this.genericEntityProperties = genericEntityProperties; } /** @@ -118,13 +114,11 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { - this.dataFile = dataFile; + this(dataFile, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); this.targetLine = targetLine; - this.geneticProfileId = geneticProfileId; - this.genePanel = genePanel; - this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); } /** @@ -137,12 +131,15 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, - DaoGeneticAlteration daoGeneticAlteration + DaoGeneticAlteration daoGeneticAlteration, + DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanel = genePanel; + this.daoGeneticAlteration = daoGeneticAlteration; this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); + this.daoGene = daoGene; } /** @@ 
-151,14 +148,20 @@ public ImportTabDelimData( * @throws IOException IO Error. * @throws DaoException Database Error. */ - public void importData(int numLines) throws IOException, DaoException { + public void importData() throws IOException, DaoException { + try { + this.numLines = FileUtil.getNumLines(dataFile); + } catch (IOException e) { + throw new RuntimeException(e); + } + ProgressMonitor.setMaxValue(numLines); geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); - String parts[] = headerLine.split("\t"); + String headerParts[] = headerLine.split("\t"); //Whether data regards CNA or RPPA: boolean isDiscretizedCnaProfile = geneticProfile != null @@ -166,23 +169,23 @@ public void importData(int numLines) throws IOException, DaoException { && geneticProfile.showProfileInAnalysisTab(); boolean isRppaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.PROTEIN_LEVEL - && "Composite.Element.Ref".equalsIgnoreCase(parts[0]); + && "Composite.Element.Ref".equalsIgnoreCase(headerParts[0]); boolean isGsvaProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE - && parts[0].equalsIgnoreCase("geneset_id"); + && headerParts[0].equalsIgnoreCase("geneset_id"); boolean isGenericAssayProfile = geneticProfile != null && geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENERIC_ASSAY - && parts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); + && headerParts[0].equalsIgnoreCase("ENTITY_STABLE_ID"); int numRecordsToAdd = 0; int samplesSkipped = 0; try { - int hugoSymbolIndex = getHugoSymbolIndex(parts); - int entrezGeneIdIndex = getEntrezGeneIdIndex(parts); - int rppaGeneRefIndex = getRppaGeneRefIndex(parts); - int genesetIdIndex = getGenesetIdIndex(parts); - int sampleStartIndex = getStartIndex(parts, 
hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); - int genericAssayIdIndex = getGenericAssayIdIndex(parts); + int hugoSymbolIndex = getHugoSymbolIndex(headerParts); + int entrezGeneIdIndex = getEntrezGeneIdIndex(headerParts); + int rppaGeneRefIndex = getRppaGeneRefIndex(headerParts); + int genesetIdIndex = getGenesetIdIndex(headerParts); + int sampleStartIndex = getStartIndex(headerParts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); + int genericAssayIdIndex = getGenericAssayIdIndex(headerParts); if (isRppaProfile) { if (rppaGeneRefIndex == -1) { throw new RuntimeException("Error: the following column should be present for RPPA data: Composite.Element.Ref"); @@ -200,8 +203,8 @@ public void importData(int numLines) throws IOException, DaoException { } String sampleIds[]; - sampleIds = new String[parts.length - sampleStartIndex]; - System.arraycopy(parts, sampleStartIndex, sampleIds, 0, parts.length - sampleStartIndex); + sampleIds = new String[headerParts.length - sampleStartIndex]; + System.arraycopy(headerParts, sampleStartIndex, sampleIds, 0, headerParts.length - sampleStartIndex); int nrUnknownSamplesAdded = 0; ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); @@ -258,12 +261,6 @@ public void importData(int numLines) throws IOException, DaoException { DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - //Gene cache: - DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); - - //Object to insert records in the generic 'genetic_alteration' table: - DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); if (isDiscretizedCnaProfile) { @@ -277,28 +274,70 @@ public void importData(int numLines) throws IOException, DaoException { genericAssayStableIdToEntityIdMap = GenericAssayMetaUtils.buildGenericAssayStableIdToEntityIdMap(); } - 
int lenParts = parts.length; + int headerColumns = headerParts.length; String line = buf.readLine(); while (line != null) { + ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); boolean recordAdded = false; - // either parse line as geneset or gene for importing into 'genetic_alteration' table - if (isGsvaProfile) { - recordAdded = parseGenesetLine(line, lenParts, sampleStartIndex, genesetIdIndex, - filteredSampleIndices, daoGeneticAlteration); - } else if (isGenericAssayProfile) { - recordAdded = parseGenericAssayLine(line, lenParts, sampleStartIndex, genericAssayIdIndex, - filteredSampleIndices, daoGeneticAlteration, genericAssayStableIdToEntityIdMap); - } else { - recordAdded = parseLine(line, lenParts, sampleStartIndex, - hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, - isRppaProfile, isDiscretizedCnaProfile, - daoGene, - filteredSampleIndices, orderedSampleList, - existingCnaEvents); + if (FileUtil.isInfoLine(line)) { + String[] rowParts = line.split("\t", -1); + + if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { + ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); + + } else { + String sampleValues[] = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length > headerColumns ? 
headerColumns : rowParts.length); + + // trim whitespace from values + sampleValues = Stream.of(sampleValues).map(String::trim).toArray(String[]::new); + sampleValues = filterOutNormalValues(filteredSampleIndices, sampleValues); + + // either parse line as geneset or gene for importing into 'genetic_alteration' table + if (isGsvaProfile) { + String genesetId = rowParts[genesetIdIndex]; + recordAdded = saveGenesetLine(sampleValues, genesetId); + } else if (isGenericAssayProfile) { + String genericAssayId = rowParts[genericAssayIdIndex]; + recordAdded = saveGenericAssayLine(sampleValues, genericAssayId, genericAssayStableIdToEntityIdMap); + } else { + String geneSymbol = null; + if (hugoSymbolIndex != -1) { + geneSymbol = rowParts[hugoSymbolIndex]; + } + if (rppaGeneRefIndex != -1) { + geneSymbol = rowParts[rppaGeneRefIndex]; + } + if (geneSymbol != null && geneSymbol.isEmpty()) { + geneSymbol = null; + } + //get entrez + String entrez = null; + if (entrezGeneIdIndex != -1) { + entrez = rowParts[entrezGeneIdIndex]; + } + if (entrez != null && entrez.isEmpty()) { + entrez = null; + } + if (entrez != null && !entrez.matches("[0-9]+")) { + //TODO - would be better to give an exception in some cases, like negative Entrez values + ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); + } else { + String firstCellValue = rowParts[0]; + if (targetLine == null || firstCellValue.equals(targetLine)) { + recordAdded = saveLine(sampleValues, + entrez, geneSymbol, + isRppaProfile, isDiscretizedCnaProfile, orderedSampleList, + existingCnaEvents); + } + } + } + } + } // increment number of records added or entries skipped @@ -483,331 +522,226 @@ private Map, Map> readPdAnnotations(File * AMIXED0... 
* * - * @param line the line from the profile data file to be parsed - * @param nrColumns the number of columns, defined by the header line - * @param sampleStartIndex the index of the first column with a sample name in the header field - * @param hugoSymbolIndex the index of the column Hugo_Symbol - * @param entrezGeneIdIndex the index of the column Entrez_Gene_Id - * @param rppaGeneRefIndex the index of the column Composite.Element.Ref * @param isRppaProfile true if this is an rppa profile (i.e. alteration type is PROTEIN_LEVEL and the first column is Composite.Element.Ref) * @param isDiscretizedCnaProfile true if this is a discretized CNA profile (i.e. alteration type COPY_NUMBER_ALTERATION and showProfileInAnalysisTab is true) - * @param daoGene an instance of DaoGeneOptimized ... for use in resolving gene symbols * @param orderedSampleList a list of the internal sample ids corresponding to the sample names in the header line * @param existingCnaEvents a collection of CnaEvents, to be added to or updated during parsing of individual lines * @return true if any record was stored in genetic_alteration, else false * @throws DaoException if any DaoException is thrown while using daoGene or daoGeneticAlteration */ - private boolean parseLine(String line, int nrColumns, int sampleStartIndex, - int hugoSymbolIndex, int entrezGeneIdIndex, int rppaGeneRefIndex, - boolean isRppaProfile, boolean isDiscretizedCnaProfile, - DaoGeneOptimized daoGene, - List filteredSampleIndices, List orderedSampleList, - Set existingCnaEvents + private boolean saveLine(String[] values, + String entrez, + String geneSymbol, + boolean isRppaProfile, + boolean isDiscretizedCnaProfile, + List orderedSampleList, + Set existingCnaEvents ) throws DaoException { - //TODO: refactor this entire function - split functionality into smaller units / subroutines - boolean recordStored = false; - // Ignore lines starting with # - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = 
line.split("\t", -1); + if (isRppaProfile && geneSymbol == null) { + ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + return false; + } - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - values = filterOutNormalValues(filteredSampleIndices, values); + //If all are empty, skip line: + boolean noGeneSpecified = geneSymbol == null && entrez == null; + if (noGeneSpecified) { + ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + return false; + } - String geneSymbol = null; - if (hugoSymbolIndex != -1) { - geneSymbol = parts[hugoSymbolIndex]; - } - //RPPA: //TODO - we should split up the RPPA scenario from this code...too many if/else because of this - if (rppaGeneRefIndex != -1) { - geneSymbol = parts[rppaGeneRefIndex]; - } - if (geneSymbol != null && geneSymbol.isEmpty()) { - geneSymbol = null; + if (geneSymbol != null) { + boolean multipleGenesLine = geneSymbol.contains("///"); + if (multipleGenesLine) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is separated by ///. This indicates that the line contains information regarding multiple genes, and we cannot currently handle this"); + return false; } - if (isRppaProfile && geneSymbol == null) { - ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + boolean unknownGene = geneSymbol.contains("---"); + if (unknownGene) { + ProgressMonitor.logWarning("Ignoring gene symbol: " + geneSymbol + + " It is specified as ---. 
This indicates that the line contains information regarding an unknown gene, and we cannot currently handle this"); return false; } - //get entrez - String entrez = null; - if (entrezGeneIdIndex != -1) { - entrez = parts[entrezGeneIdIndex]; + } + + List genes; + //If rppa, parse genes from "Composite.Element.REF" column: + if (isRppaProfile) { + genes = parseRPPAGenes(geneSymbol); + } else { + genes = parseGenes(entrez, geneSymbol); + } + + //if genes still null, skip current record + if (genes == null || genes.isEmpty()) { + ProgressMonitor.logWarning("Gene with Entrez_Id " + entrez + " and gene symbol" + geneSymbol +" not found. Record will be skipped for this gene."); + return false; + } + + List genesMatchingAnAlias = Collections.emptyList(); + if (geneSymbol != null) { + genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); + } + + Set microRNAGenes = new HashSet<>(); + Set nonMicroRNAGenes = new HashSet<>(); + Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); + while (geneIterator.hasNext()) { + CanonicalGene g = geneIterator.next(); + if ("miRNA".equals(g.getType())) { + microRNAGenes.add(g); + } else { + nonMicroRNAGenes.add(g); } - if (entrez != null) { - if (entrez.isEmpty()) { - entrez = null; - } - else if (!entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values - ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); - return false; + } + if (!microRNAGenes.isEmpty()) { + // for micro rna, duplicate the data + for (CanonicalGene gene : microRNAGenes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; } } - - //If all are empty, skip line: - if (geneSymbol == null && entrez == null) { - ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol and no Entrez_Id"); + if (!recordStored) { + if (nonMicroRNAGenes.isEmpty()) { + // this means that no microRNA records 
could not be stored + ProgressMonitor.logWarning("Could not store microRNA data"); + } else { + // this case : + // - at least one of the entrez-gene-ids was not a microRNA + // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); + } return false; + } + } else { + // none of the matched genes are type "miRNA" + if (genes.size() == 1) { + // Store all values per gene: + recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); + //only add extra CNA related records if the step above worked, otherwise skip: + if (recordStored && isDiscretizedCnaProfile) { + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, orderedSampleList, genes)); + } } else { - if (geneSymbol != null && (geneSymbol.contains("///") || geneSymbol.contains("---"))) { - // Ignore gene IDs separated by ///. This indicates that - // the line contains information regarding multiple genes, and - // we cannot currently handle this. - // Also, ignore gene IDs that are specified as ---. This indicates - // the line contains information regarding an unknown gene, and - // we cannot currently handle this. - ProgressMonitor.logWarning("Ignoring gene ID: " + geneSymbol); - return false; + if (isRppaProfile) { // for protein data, duplicate the data + recordStored = saveRppaValues(values, geneSymbol, recordStored, genes); } else { - List genes = null; - //If rppa, parse genes from "Composite.Element.REF" column: - if (isRppaProfile) { - genes = parseRPPAGenes(geneSymbol); - if (genes == null) { - //will be null when there is a parse error in this case, so we - //can return here and avoid duplicated messages: - return false; - } - if (genes.isEmpty()) { - String gene = (geneSymbol != null) ? 
geneSymbol : entrez; - ProgressMonitor.logWarning("Gene not found for: [" + gene - + "]. Ignoring it " - + "and all tab-delimited data associated with it!"); - return false; - } - } else { - //try entrez: - if (entrez != null) { - CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); - if (gene != null) { - genes = Arrays.asList(gene); - } - } - //no entrez or could not resolve by entrez, try hugo: - if ((genes == null || genes.isEmpty()) && geneSymbol != null) { - // deal with multiple symbols separate by |, use the first one - int ix = geneSymbol.indexOf("|"); - if (ix > 0) { - geneSymbol = geneSymbol.substring(0, ix); - } - genes = daoGene.getGene(geneSymbol, true); - } - //if genes still null, skip current record - if (genes == null || genes.isEmpty()) { - ProgressMonitor.logWarning("Entrez_Id " + entrez + " not found. Record will be skipped for this gene."); - return false; - } + if (!recordStored) { + // this case : + // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); } + } + } + } + return recordStored; + } - // If targetLine is specified and does not match the current line, skip the current line. 
- if (targetLine != null && !(parts[0].equals(targetLine))) { - return false; - } + private boolean saveRppaValues(String[] values, String geneSymbol, boolean recordStored, List genes) throws DaoException { + for (CanonicalGene gene : genes) { + if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + recordStored = true; + nrExtraRecords++; + } + } + if (recordStored) { + //skip one, to avoid double counting: + nrExtraRecords--; + } else { + // this means that RPPA could not be stored + ProgressMonitor.logWarning("Could not store RPPA data"); + } + return recordStored; + } - List genesMatchingAnAlias = Collections.emptyList(); - if (geneSymbol != null) { - genesMatchingAnAlias = daoGene.getGenesForAlias(geneSymbol); - } + private List parseGenes(String entrez, String geneSymbol) { + //try entrez: + if (entrez != null) { + CanonicalGene gene = daoGene.getGene(Long.parseLong(entrez)); + if (gene != null) { + return Arrays.asList(gene); + } + } + //no entrez or could not resolve by entrez, try hugo: + if (geneSymbol != null) { + // deal with multiple symbols separate by |, use the first one + int ix = geneSymbol.indexOf("|"); + if (ix > 0) { + geneSymbol = geneSymbol.substring(0, ix); + } + return daoGene.getGene(geneSymbol, true); + } + return List.of(); + } - Set microRNAGenes = new HashSet<>(); - Set nonMicroRNAGenes = new HashSet<>(); - Iterator geneIterator = Stream.concat(genes.stream(), genesMatchingAnAlias.stream()).iterator(); - while (geneIterator.hasNext()) { - CanonicalGene g = geneIterator.next(); - if ("miRNA".equals(g.getType())) { - microRNAGenes.add(g); - } else { - nonMicroRNAGenes.add(g); - } - } - if (!microRNAGenes.isEmpty()) { - // for micro rna, duplicate the data - for (CanonicalGene gene : microRNAGenes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - } - } - if (!recordStored) { - if (nonMicroRNAGenes.isEmpty()) { - // this means that no microRNA records could not be stored - 
ProgressMonitor.logWarning("Could not store microRNA data"); - } else { - // this case : - // - at least one of the entrez-gene-ids was not a microRNA - // - all of the matched microRNA ids (if any) failed to be imported (presumably already imported on a prior line) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous (a mixture of microRNA and other types). Record will be skipped for this gene."); - } - return false; - } - } else { - // none of the matched genes are type "miRNA" - if (genes.size() == 1) { - List cnaEventsToAdd = new ArrayList(); - - if (isDiscretizedCnaProfile) { - long entrezGeneId = genes.get(0).getEntrezGeneId(); - for (int i = 0; i < values.length; i++) { - - // temporary solution -- change partial deletion back to full deletion. - if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { - values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; - } - if (values[i].equals(CNA_VALUE_AMPLIFICATION) - // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB - // || values[i].equals(CNA_VALUE_ZERO) - // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) - || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) - ) { - Integer sampleId = orderedSampleList.get(i); - CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); - //delayed add: - AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); - Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); - if (pdAnnotationDetails != null) { - cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); - cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); - cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); - cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); - } - cnaEventsToAdd.add(cnaEvent); - } - } - } - // Store all 
values per gene: - recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); - //only add extra CNA related records if the step above worked, otherwise skip: - if (recordStored) { - CnaUtil.storeCnaEvents(existingCnaEvents, cnaEventsToAdd); - } - } else { - if (isRppaProfile) { // for protein data, duplicate the data - for (CanonicalGene gene : genes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { - recordStored = true; - nrExtraRecords++; - } - } - if (recordStored) { - //skip one, to avoid double counting: - nrExtraRecords--; - } else { - // this means that RPPA could not be stored - ProgressMonitor.logWarning("Could not store RPPA data"); - } - } else { - if (!recordStored) { - // this case : - // - the hugo gene symbol was ambiguous (matched multiple entrez-gene-ids) - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambiguous. Record will be skipped for this gene."); - } - } - } - } + private List composeCnaEventsToAdd(String[] values, List orderedSampleList, List genes) { + List cnaEventsToAdd = new ArrayList(); + long entrezGeneId = genes.get(0).getEntrezGeneId(); + for (int i = 0; i < values.length; i++) { + + // temporary solution -- change partial deletion back to full deletion. 
+ if (values[i].equals(CNA_VALUE_PARTIAL_DELETION)) { + values[i] = CNA_VALUE_HOMOZYGOUS_DELETION; + } + if (values[i].equals(CNA_VALUE_AMPLIFICATION) + // || values[i].equals(CNA_VALUE_GAIN) >> skipping GAIN, ZERO, HEMIZYGOUS_DELETION to minimize size of dataset in DB + // || values[i].equals(CNA_VALUE_ZERO) + // || values[i].equals(CNA_VALUE_HEMIZYGOUS_DELETION) + || values[i].equals(CNA_VALUE_HOMOZYGOUS_DELETION) + ) { + Integer sampleId = orderedSampleList.get(i); + CnaEvent cnaEvent = new CnaEvent(sampleId, geneticProfileId, entrezGeneId, Short.parseShort(values[i])); + //delayed add: + AbstractMap.SimpleEntry sampleGenePair = new AbstractMap.SimpleEntry<>(sampleId, entrezGeneId); + Map pdAnnotationDetails = this.pdAnnotations.get(sampleGenePair); + if (pdAnnotationDetails != null) { + cnaEvent.setDriverFilter(pdAnnotationDetails.get("DRIVER_FILTER")); + cnaEvent.setDriverFilterAnnotation(pdAnnotationDetails.get("DRIVER_FILTER_ANNOTATION")); + cnaEvent.setDriverTiersFilter(pdAnnotationDetails.get("DRIVER_TIERS_FILTER")); + cnaEvent.setDriverTiersFilterAnnotation(pdAnnotationDetails.get("DRIVER_TIERS_FILTER_ANNOTATION")); } + cnaEventsToAdd.add(cnaEvent); } } - return recordStored; + return cnaEventsToAdd; } /** * Parses line for gene set record and stores record in 'genetic_alteration' table. 
- * @param line - * @param nrColumns - * @param sampleStartIndex - * @param genesetIdIndex - * @param filteredSampleIndices - * @param daoGeneticAlteration + * @param genesetId * @return * @throws DaoException */ - private boolean parseGenesetLine(String line, int nrColumns, int sampleStartIndex, int genesetIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration) throws DaoException { + private boolean saveGenesetLine(String[] values, String genesetId) throws DaoException { boolean storedRecord = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } - - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - Geneset geneset = DaoGeneset.getGenesetByExternalId(parts[genesetIdIndex]); - if (geneset != null) { - storedRecord = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, geneset.getGeneticEntityId(), - EntityType.GENESET, geneset.getExternalId()); - } - else { - ProgressMonitor.logWarning("Geneset " + parts[genesetIdIndex] + " not found in DB. Record will be skipped."); - } + Geneset geneset = DaoGeneset.getGenesetByExternalId(genesetId); + if (geneset != null) { + storedRecord = storeGeneticEntityGeneticAlterations(values, geneset.getGeneticEntityId(), EntityType.GENESET, geneset.getExternalId()); + } + else { + ProgressMonitor.logWarning("Geneset " + genesetId + " not found in DB. 
Record will be skipped."); } return storedRecord; } /** * Parses line for generic assay profile record and stores record in 'genetic_alteration' table. - * @param line row from the separated-text that contains one or more values on a single sample - * @param nrColumns - * @param sampleStartIndex index of the first sample column - * @param genericAssayIdIndex index of the column that uniquely identifies a sample - * @param filteredSampleIndices - * @param daoGeneticAlteration - * @return - * @throws DaoException */ - - private boolean parseGenericAssayLine(String line, int nrColumns, int sampleStartIndex, int genericAssayIdIndex, - List filteredSampleIndices, DaoGeneticAlteration daoGeneticAlteration, Map genericAssayStableIdToEntityIdMap) throws DaoException { + private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map genericAssayStableIdToEntityIdMap) { boolean recordIsStored = false; - if (!line.startsWith("#") && line.trim().length() > 0) { - String[] parts = line.split("\t", -1); - - if (parts.length > nrColumns) { - if (line.split("\t").length > nrColumns) { - ProgressMonitor.logWarning("Ignoring line with more fields (" + parts.length - + ") than specified in the headers(" + nrColumns + "): \n" + parts[0]); - return false; - } - } + Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(genericAssayId, null); - String values[] = (String[]) ArrayUtils.subarray(parts, sampleStartIndex, parts.length > nrColumns ? nrColumns : parts.length); - - // trim whitespace from values - values = Stream.of(values).map(String::trim).toArray(String[]::new); - values = filterOutNormalValues(filteredSampleIndices, values); - - String stableId = parts[genericAssayIdIndex]; - Integer entityId = genericAssayStableIdToEntityIdMap.getOrDefault(stableId, null); - - if (entityId == null) { - ProgressMonitor.logWarning("Generic Assay entity " + parts[genericAssayIdIndex] + " not found in DB. 
Record will be skipped."); - } else { - recordIsStored = storeGeneticEntityGeneticAlterations(values, daoGeneticAlteration, entityId, - EntityType.GENERIC_ASSAY, stableId); - } - - return recordIsStored; + if (entityId == null) { + ProgressMonitor.logWarning("Generic Assay entity " + genericAssayId + " not found in DB. Record will be skipped."); + } else { + recordIsStored = storeGeneticEntityGeneticAlterations(values, entityId, EntityType.GENERIC_ASSAY, genericAssayId); } return recordIsStored; @@ -816,14 +750,12 @@ private boolean parseGenericAssayLine(String line, int nrColumns, int sampleStar /** * Stores genetic alteration data for a genetic entity. * @param values - * @param daoGeneticAlteration * @param geneticEntityId - internal id for genetic entity * @param geneticEntityType - "GENE", "GENESET", "PHOSPHOPROTEIN" * @param geneticEntityName - hugo symbol for "GENE", external id for "GENESET", phospho gene name for "PHOSPHOPROTEIN" * @return boolean indicating if record was stored successfully or not */ - private boolean storeGeneticEntityGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration, - Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { + private boolean storeGeneticEntityGeneticAlterations(String[] values, Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { try { if (importedGeneticEntitySet.add(geneticEntityId)) { daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values); diff --git a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java index 744ca565..2e767618 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/FileUtil.java @@ -43,30 +43,6 @@ * @author Ethan Cerami. */ public class FileUtil { - /** - * BioPAX File Type. 
- */ - public static final int BIOPAX = 0; - - /** - * PSI_MI File Type. - */ - public static final int PSI_MI = 1; - - /** - * External DBs File Type. - */ - public static final int EXTERNAL_DBS = 2; - - /** - * Identifiers File Type. - */ - public static final int IDENTIFIERS = 3; - - /** - * Unknown File Type. - */ - public static final int UNKNOWN = 4; /** * Gets Number of Lines in Specified File. @@ -77,32 +53,26 @@ public class FileUtil { */ public static int getNumLines(File file) throws IOException { int numLines = 0; - FileReader reader = new FileReader(file); - BufferedReader buffered = new BufferedReader(reader); - String line = buffered.readLine(); - while (line != null) { - if (!line.startsWith("#") && line.trim().length() > 0) { - numLines++; + try (FileReader reader = new FileReader(file); BufferedReader buffered = new BufferedReader(reader)) { + String line = buffered.readLine(); + while (line != null) { + if (isInfoLine(line)) { + numLines++; + } + line = buffered.readLine(); } - line = buffered.readLine(); + return numLines; } - reader.close(); - return numLines; } /** - * Gets Next Line of Input. Filters out Empty Lines and Comments. - * - * @param buf BufferedReader Object. - * @return next line of input. - * @throws IOException Error reading input stream. + * Does the line carry any information? + * e.g. 
blank lines and comments do not + * @param line + * @return */ - public static String getNextLine(BufferedReader buf) throws IOException { - String line = buf.readLine(); - while (line != null && (line.trim().length() == 0 - || line.trim().startsWith("#"))) { - line = buf.readLine(); - } - return line; + public static boolean isInfoLine(String line) { + return !line.startsWith("#") && line.trim().length() > 0; } + } \ No newline at end of file diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index 33779cd3..800a368e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -38,7 +38,6 @@ import org.mskcc.cbio.portal.dao.DaoCancerStudy; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneset; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.dao.DaoPatient; @@ -48,15 +47,12 @@ import org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.CanonicalGene; import org.mskcc.cbio.portal.model.CopyNumberStatus; -import org.mskcc.cbio.portal.model.Geneset; import org.mskcc.cbio.portal.model.GeneticAlterationType; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; -import org.mskcc.cbio.portal.scripts.ImportGenesetData; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.mskcc.cbio.portal.util.ConsoleUtil; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import 
org.springframework.test.context.ContextConfiguration; @@ -171,9 +167,8 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); assertEquals ("0", value); @@ -236,9 +231,8 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); assertEquals (value, "0"); @@ -321,9 +315,8 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, 
null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "DD639").getInternalId(); @@ -375,9 +368,8 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); // check if expected warnings are given: ArrayList warnings = ProgressMonitor.getWarnings(); @@ -468,9 +460,8 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance()); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + parser.importData(); ConsoleUtil.showMessages(); int sampleId = DaoSample.getSampleByCancerStudyAndSampleId(studyId, "SAMPLE1").getInternalId(); From a7aab3a686332208dcfe855afed4d41bfccb443f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 3 May 2024 16:24:07 +0200 Subject: [PATCH 02/27] Implement incremental upload of mRNA data --- .../cbio/portal/dao/DaoGeneticAlteration.java | 19 ++- .../scripts/GeneticAlterationImporter.java | 4 + 
.../portal/scripts/ImportProfileData.java | 2 + .../portal/scripts/ImportTabDelimData.java | 161 +++++++++++++++--- .../org/mskcc/cbio/portal/util/CnaUtil.java | 3 +- .../TestIncrementalTabDelimData.java | 139 +++++++++++++++ .../scripts/TestImportTabDelimData.java | 10 +- .../data_expression_Zscores.txt | 24 +++ 8 files changed, 325 insertions(+), 37 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java create mode 100644 src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index 25eca11c..fcc2380e 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -94,7 +94,8 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String throws DaoException { return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values); } - + + // TODO inc: update instead public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values) throws DaoException { @@ -237,8 +238,8 @@ public HashMap> getGeneticAlterationMapForEntit int geneticEntityId = rs.getInt("GENETIC_ENTITY_ID"); String values = rs.getString("VALUES"); //hm.debug.. 
- String valueParts[] = values.split(DELIM); - for (int i=0; i getProcessedAlterationData( rs = pstmt.executeQuery(); while (rs.next()) { long entrezGeneId = DaoGeneOptimized.getEntrezGeneId(rs.getInt("GENETIC_ENTITY_ID")); - String[] values = rs.getString("VALUES").split(DELIM); + String valuesString = rs.getString("VALUES"); + if (valuesString.endsWith(DELIM)) { + valuesString = valuesString.substring(0, valuesString.length() - DELIM.length()); + } + String[] values = valuesString.split(DELIM, -1); ObjectNode datum = processor.process( entrezGeneId, values, @@ -425,17 +430,19 @@ public int getCount() throws DaoException { * Deletes all Genetic Alteration Records associated with the specified Genetic Profile ID. * * @param geneticProfileId Genetic Profile ID. + * @param geneticEntityId Genetic Entity ID. * @throws DaoException Database Error. */ - public void deleteAllRecordsInGeneticProfile(long geneticProfileId) throws DaoException { + public void deleteAllRecordsInGeneticProfile(long geneticProfileId, long geneticEntityId) throws DaoException { Connection con = null; PreparedStatement pstmt = null; ResultSet rs = null; try { con = JdbcUtil.getDbConnection(DaoGeneticAlteration.class); pstmt = con.prepareStatement("DELETE from " + - "genetic_alteration WHERE GENETIC_PROFILE_ID=?"); + "genetic_alteration WHERE GENETIC_PROFILE_ID=? 
and GENETIC_ENTITY_ID=?"); pstmt.setLong(1, geneticProfileId); + pstmt.setLong(2, geneticEntityId); pstmt.executeUpdate(); } catch (SQLException e) { throw new DaoException(e); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java index 32aa43f2..623b3122 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/GeneticAlterationImporter.java @@ -55,5 +55,9 @@ public boolean store( } } + public boolean isImportedAlready(CanonicalGene gene) { + return importSetOfGenes.contains(gene.getEntrezGeneId()); + } + } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index e4b11844..d34ab2cc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -127,6 +127,7 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), + false, daoGeneticAlteration, daoGene ); genericAssayProfileImporter.importData(); @@ -151,6 +152,7 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, + false, daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 81300e63..76ffcb0b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -36,6 +36,7 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; +import 
java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -74,6 +75,11 @@ public class ImportTabDelimData { private DaoGeneOptimized daoGene; + private boolean updateMode; + private HashMap> geneticAlterationMap; + private ArrayList orderedImportedSampleList; + private ArrayList orderedSampleList; + /** * Constructor. * @@ -83,7 +89,8 @@ public class ImportTabDelimData { * @param geneticProfileId GeneticProfile ID. * @param genePanel GenePanel * @param genericEntityProperties Generic Assay Entities. - * + * @param updateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -92,10 +99,11 @@ public ImportTabDelimData( int geneticProfileId, String genePanel, String genericEntityProperties, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, targetLine, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this(dataFile, targetLine, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); this.genericEntityProperties = genericEntityProperties; } @@ -106,7 +114,8 @@ public ImportTabDelimData( * @param targetLine The line we want to import. * If null, all lines are imported. * @param geneticProfileId GeneticProfile ID. - * + * @param updateMode if true, update/append data to the existing one + * * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData( @@ -114,16 +123,18 @@ public ImportTabDelimData( String targetLine, int geneticProfileId, String genePanel, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { - this(dataFile, geneticProfileId, genePanel, daoGeneticAlteration, daoGene); + this(dataFile, geneticProfileId, genePanel, updateMode, daoGeneticAlteration, daoGene); this.targetLine = targetLine; } /** * Constructor. 
* + * @param updateMode if true, update/append data to the existing one * @param dataFile Data File containing Copy Number Alteration, MRNA Expression Data, or protein RPPA data * @param geneticProfileId GeneticProfile ID. */ @@ -131,15 +142,18 @@ public ImportTabDelimData( File dataFile, int geneticProfileId, String genePanel, + boolean updateMode, DaoGeneticAlteration daoGeneticAlteration, DaoGeneOptimized daoGene ) { this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanel = genePanel; + this.updateMode = updateMode; this.daoGeneticAlteration = daoGeneticAlteration; this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.daoGene = daoGene; + this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); } /** @@ -154,10 +168,10 @@ public void importData() throws IOException, DaoException { } catch (IOException e) { throw new RuntimeException(e); } + if (updateMode) { + geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfile.getGeneticProfileId(), null); + } ProgressMonitor.setMaxValue(numLines); - - geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); @@ -214,9 +228,9 @@ public void importData() throws IOException, DaoException { pdAnnotationsForStableSampleIds = readPdAnnotations(this.pdAnnotationsFile); } // link Samples to the genetic profile - ArrayList orderedSampleList = new ArrayList(); ArrayList filteredSampleIndices = new ArrayList(); this.pdAnnotations = new HashMap<>(); + this.orderedSampleList = new ArrayList<>(); for (int i = 0; i < sampleIds.length; i++) { Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(sampleIds[i])); @@ -231,10 +245,7 @@ public void importData() throws IOException, 
DaoException { throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); } } - if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { - Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); - DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); - } + ensureSampleGeneticProfile(sample); orderedSampleList.add(sample.getInternalId()); if (pdAnnotationsForStableSampleIds != null) { Set> keys = new HashSet<>(pdAnnotationsForStableSampleIds.keySet()); @@ -259,7 +270,7 @@ public void importData() throws IOException, DaoException { } ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines - 1)); - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + saveOrderedSampleList(); //cache for data found in cna_event' table: Set existingCnaEvents = new HashSet<>(); @@ -331,7 +342,7 @@ public void importData() throws IOException, DaoException { if (targetLine == null || firstCellValue.equals(targetLine)) { recordAdded = saveLine(sampleValues, entrez, geneSymbol, - isRppaProfile, isDiscretizedCnaProfile, orderedSampleList, + isRppaProfile, isDiscretizedCnaProfile, existingCnaEvents); } } @@ -350,6 +361,7 @@ public void importData() throws IOException, DaoException { line = buf.readLine(); } + expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue(); if (MySQLbulkLoader.isBulkLoad()) { MySQLbulkLoader.flushAll(); } @@ -371,6 +383,66 @@ public void importData() throws IOException, DaoException { } } + private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { + if (updateMode) { + // Expand remaining genetic entity id rows that were not mentioned in the file + new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { + try { + String[] values = 
orderedImportedSampleList.stream() + .map(sampleId -> + geneticAlterationMap.get(geneticEntityId).containsKey(sampleId) ? + geneticAlterationMap.get(geneticEntityId).get(sampleId) : "") + .toArray(String[]::new); + + saveValues(geneticEntityId, values); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }); + } + } + + private void ensureSampleGeneticProfile(Sample sample) throws DaoException { + if (!DaoSampleProfile.sampleExistsInGeneticProfile(sample.getInternalId(), geneticProfileId)) { + Integer genePanelID = (genePanel == null) ? null : GeneticProfileUtil.getGenePanelId(genePanel); + if (updateMode) { + DaoSampleProfile.deleteRecords(List.of(sample.getInternalId()), List.of(geneticProfileId)); + } + DaoSampleProfile.addSampleProfile(sample.getInternalId(), geneticProfileId, genePanelID); + } + } + + private void saveOrderedSampleList() throws DaoException { + if (updateMode) { + ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); + // add all new sample ids at the end + ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); + List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); + extendedSampleList.addAll(newSampleIds); + orderedImportedSampleList = orderedSampleList; + orderedSampleList = extendedSampleList; + + + DaoGeneticProfileSamples.deleteAllSamplesInGeneticProfile(geneticProfileId); + } + DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + } + + //TODO move somewhere else + private Map zip(K[] keys, V[] values) { + Map map = new HashMap<>(); + + // Check if both arrays have the same length + if (keys.length == values.length) { + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + } else { + throw new IllegalArgumentException("Arrays must be of the same length"); + } + return map; + } + private Map, Map> readPdAnnotations(File 
pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; @@ -524,7 +596,6 @@ private Map, Map> readPdAnnotations(File * * @param isRppaProfile true if this is an rppa profile (i.e. alteration type is PROTEIN_LEVEL and the first column is Composite.Element.Ref) * @param isDiscretizedCnaProfile true if this is a discretized CNA profile (i.e. alteration type COPY_NUMBER_ALTERATION and showProfileInAnalysisTab is true) - * @param orderedSampleList a list of the internal sample ids corresponding to the sample names in the header line * @param existingCnaEvents a collection of CnaEvents, to be added to or updated during parsing of individual lines * @return true if any record was stored in genetic_alteration, else false * @throws DaoException if any DaoException is thrown while using daoGene or daoGeneticAlteration @@ -534,7 +605,6 @@ private boolean saveLine(String[] values, String geneSymbol, boolean isRppaProfile, boolean isDiscretizedCnaProfile, - List orderedSampleList, Set existingCnaEvents ) throws DaoException { @@ -600,7 +670,7 @@ private boolean saveLine(String[] values, if (!microRNAGenes.isEmpty()) { // for micro rna, duplicate the data for (CanonicalGene gene : microRNAGenes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + if (this.saveValues(gene, values)) { recordStored = true; } } @@ -620,14 +690,14 @@ private boolean saveLine(String[] values, // none of the matched genes are type "miRNA" if (genes.size() == 1) { // Store all values per gene: - recordStored = this.geneticAlterationImporter.store(values, genes.get(0), geneSymbol); + recordStored = this.saveValues(genes.get(0), values); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { - CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, orderedSampleList, genes)); + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, genes)); } } else 
{ if (isRppaProfile) { // for protein data, duplicate the data - recordStored = saveRppaValues(values, geneSymbol, recordStored, genes); + recordStored = saveRppaValues(values, recordStored, genes); } else { if (!recordStored) { // this case : @@ -640,9 +710,51 @@ private boolean saveLine(String[] values, return recordStored; } - private boolean saveRppaValues(String[] values, String geneSymbol, boolean recordStored, List genes) throws DaoException { + private boolean saveValues(CanonicalGene canonicalGene, String[] values) throws DaoException { + //TODO Think of better way. We do that to do not remove genes that contain duplicate + if (geneticAlterationImporter.isImportedAlready(canonicalGene)) { + return false; + } + if (updateMode) { + values = updateValues(canonicalGene.getGeneticEntityId(), values); + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + } + return geneticAlterationImporter.store(values, canonicalGene, canonicalGene.getHugoGeneSymbolAllCaps()); + } + //TODO unify saveValues versions + // With update mode the last duplicate wins. 
It's different from the other function + private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { + if (updateMode) { + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); + values = updateValues(geneticEntityId, values); + } + return daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values) > 0; + } + + private String[] updateValues(int geneticEntityId, String[] values) { + //TODO swap variables + Map sampleIdToValue = zip(orderedImportedSampleList.toArray(new Integer[0]), values); + String[] updatedSampleValues = new String[orderedSampleList.size()]; + for (int i = 0; i < orderedSampleList.size(); i++) { + updatedSampleValues[i] = ""; + int sampleId = orderedSampleList.get(i); + if (geneticAlterationMap.containsKey(geneticEntityId)) { + HashMap savedSampleIdToValue = geneticAlterationMap.get(geneticEntityId); + updatedSampleValues[i] = savedSampleIdToValue.containsKey(sampleId) ? 
savedSampleIdToValue.remove(sampleId): ""; + if (savedSampleIdToValue.isEmpty()) { + geneticAlterationMap.remove(geneticEntityId); + } + } + if (sampleIdToValue.containsKey(sampleId)) { + updatedSampleValues[i] = sampleIdToValue.get(sampleId); + } + } + return updatedSampleValues; + } + + private boolean saveRppaValues(String[] values, boolean recordStored, List genes) throws DaoException { for (CanonicalGene gene : genes) { - if (this.geneticAlterationImporter.store(values, gene, geneSymbol)) { + if (this.saveValues(gene, values)) { recordStored = true; nrExtraRecords++; } @@ -677,7 +789,7 @@ private List parseGenes(String entrez, String geneSymbol) { return List.of(); } - private List composeCnaEventsToAdd(String[] values, List orderedSampleList, List genes) { + private List composeCnaEventsToAdd(String[] values, List genes) { List cnaEventsToAdd = new ArrayList(); long entrezGeneId = genes.get(0).getEntrezGeneId(); for (int i = 0; i < values.length; i++) { @@ -758,8 +870,7 @@ private boolean saveGenericAssayLine(String[] values, String genericAssayId, Map private boolean storeGeneticEntityGeneticAlterations(String[] values, Integer geneticEntityId, EntityType geneticEntityType, String geneticEntityName) { try { if (importedGeneticEntitySet.add(geneticEntityId)) { - daoGeneticAlteration.addGeneticAlterationsForGeneticEntity(geneticProfile.getGeneticProfileId(), geneticEntityId, values); - return true; + return saveValues(geneticEntityId, values); } else { ProgressMonitor.logWarning("Data for genetic entity " + geneticEntityName diff --git a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java index 3cc6fd71..de7fe85a 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/util/CnaUtil.java @@ -45,6 +45,7 @@ public CnaUtil(String[] headerParts, Set namespaces) { this.namespaceColumnParser = new NamespaceColumnParser(namespaces, headerParts); } + // TODO 
inc: update public static void storeCnaEvents( Set existingCnaEvents, List cnaEventsToAdd @@ -53,7 +54,7 @@ public static void storeCnaEvents( if (!CNA.AMP.equals(cnaEvent.getAlteration()) && !CNA.HOMDEL.equals(cnaEvent.getAlteration())) { continue; } - + // TODO Clean cnv event // Revert PR https://github.com/cBioPortal/cbioportal-core/pull/1 breaks importer Optional existingCnaEvent = existingCnaEvents .stream() diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java new file mode 100644 index 00000000..2891373b --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -0,0 +1,139 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.dao.DaoSample; +import org.mskcc.cbio.portal.model.CancerStudy; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.model.Sample; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; +import static org.mskcc.cbio.portal.dao.DaoMutation.getMutations; + +/** + * Tests Incremental Import of Tab Delimited Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalTabDelimData { + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + + // Hugo_Symbol: CDK1 + static final long NEW_GENE_ENTREZ_ID = 983l; + + /** + * Gene that is part of the platform, but absent during the incremental upload + */ + // Hugo_Symbol: ARAF + static final long ABSENT_GENE_ENTREZ_ID = 369l; + static final Set TEST_ENTREZ_GENE_IDS = Set.of(10000l, 207l, 208l, 3265l, ABSENT_GENE_ENTREZ_ID, 3845l, 472l, 4893l, 672l, 673l, 675l, NEW_GENE_ENTREZ_ID); + + // stable_id: TCGA-A1-A0SB-01 + static final int NEW_SAMPLE_ID = 1; + + // stable_id: TCGA-A1-A0SD-01 + static final int UPDATED_SAMPLE_ID = 2; + static final Set TEST_SAMPLE_IDS = Set.of(NEW_SAMPLE_ID, UPDATED_SAMPLE_ID, 3, 6, 8, 9, 10, 12, 13); + + /** + * Test incremental upload of MRNA_EXPRESSION + */ + @Test + public void testMrnaExpression() throws DaoException, IOException { + /** + * Prior checks + */ + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + assertNotNull(mrnaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, 
beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + /** + * Test + */ + new ImportTabDelimData(dataFile, + mrnaProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); + afterResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); + if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { + return; + } + sampleIdToValue.forEach((sampleId, value) -> { + if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { + return; + } + assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + }); + }); + assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + +} diff --git 
a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java index 800a368e..f8bcc335 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportTabDelimData.java @@ -167,7 +167,7 @@ private void runImportCnaData() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, "Barry", geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 999999207); @@ -231,7 +231,7 @@ private void runImportCnaData2() throws DaoException, IOException{ ProgressMonitor.setConsoleMode(false); // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/cna_test2.txt"); - ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, geneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); String value = dao.getGeneticAlteration(geneticProfileId, sample1, 207); @@ -315,7 +315,7 @@ private void runImportRnaData1() throws DaoException, IOException{ // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/mrna_test.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, 
newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); @@ -368,7 +368,7 @@ public void testImportmRnaData2() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_expression2.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); // check if expected warnings are given: @@ -460,7 +460,7 @@ public void testImportRppaData() throws Exception { // TBD: change this to use getResourceAsStream() File file = new File("src/test/resources/tabDelimitedData/data_rppa.txt"); addTestPatientAndSampleRecords(file); - ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); + ImportTabDelimData parser = new ImportTabDelimData(file, newGeneticProfileId, null, false, DaoGeneticAlteration.getInstance(), DaoGeneOptimized.getInstance()); parser.importData(); ConsoleUtil.showMessages(); diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt new file mode 100644 index 00000000..02652403 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -0,0 +1,24 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +AKT3 10000 0.6393 0.5377 +AKT1 207 0.7850 0.0426 +# The pipe and after have to be removed 
+AKT2|TEST 208 1.0741 0.7180 +HRAS 3265 -0.1735 -0.6412 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ARAF 369 0.6393 0.5377 +KRAS 3845 0.7850 0.0426 +ATM 472 1.0741 0.7180 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -0.1735 -0.6412 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 0.6393 0.5377 +BRAF 673 0.7850 0.0426 +# Duplicate lines should be ignored +BRAF 673 0.7851 0.0427 +BRCA2 675 1.0741 0.7180 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1 983 -0.1735 -0.6412 +# These lines have to be skipped +/// 369 0.6393 0.5377 +--- 3845 0.7850 0.0426 + 1.0741 0.7180 From bd2d8c1c038d3258390f90553b066300f7d24741 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:11:40 +0200 Subject: [PATCH 03/27] Add RPPA test --- .../TestIncrementalTabDelimData.java | 55 +++++++++++++++++++ .../data_expression_Zscores.txt | 2 +- .../incremental/tab_delim_data/data_rppa.txt | 24 ++++++++ src/test/resources/seed_mini.sql | 14 +++++ 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100644 src/test/resources/incremental/tab_delim_data/data_rppa.txt diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index 2891373b..f57e5031 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -136,4 +136,59 @@ public void testMrnaExpression() throws DaoException, IOException { assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); } + /** + * Test incremental upload of PROTEIN_LEVEL + */ + @Test + public void testRppa() throws 
DaoException, IOException { + /** + * Prior checks + */ + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); + assertNotNull(rppaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_rppa.txt"); + + /** + * Test + */ + new ImportTabDelimData(dataFile, + rppaProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); + afterResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); + if (entrezGeneId == 
NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { + return; + } + sampleIdToValue.forEach((sampleId, value) -> { + if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { + return; + } + assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + }); + }); + assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + } diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 02652403..1ca71772 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,7 +1,7 @@ Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 AKT3 10000 0.6393 0.5377 AKT1 207 0.7850 0.0426 -# The pipe and after have to be removed +# All after the pipe has to be removed AKT2|TEST 208 1.0741 0.7180 HRAS 3265 -0.1735 -0.6412 # This gene absent in this file, but it's still part of the profile and has to be updated diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/tab_delim_data/data_rppa.txt new file mode 100644 index 00000000..bc3b858a --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_rppa.txt @@ -0,0 +1,24 @@ +Composite.Element.REF TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +AKT3|akt3 1.26122710480548 0.037186254715365 +# Multiple gene symbols joined by space +AKT1 AKT2 AKT3|akt1 1.61253243664957 -0.141077088398489 +# 
All after the pipe has to be removed +AKT2|TEST 5.4424238579025E-05 0.062264661774981 +HRAS|hras 0.37624053370992 0.270399126328659 +# This gene absent in this file, but it's still part of the profile and has to be updated 0.407622077164699 -0.326522823583974 +#ARAF 0.383702820778609 0.218650367364756 +KRAS|kras -0.335040546938807 0.00730643372831408 +ATM|atm 0.037186254715365 1.26122710480548 +# This line missing the entrez id and the gene has to be detected by hugo symbol 0.062264661774981 5.4424238579025E-05 +BRCA1|brca1 0.270399126328659 0.37624053370992 +BRAF|braf -0.326522823583974 0.407622077164699 +# Duplicate lines should be ignored 0.218650367364756 0.383702820778609 +BRAF|braf 0.00730643372831408 -0.335040546938807 +BRCA2|brca2 -0.141077088398489 1.61253243664957 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1|cdk1 -0.141047088398489 1.61253243564957 +# These lines have to be skipped +/// -0.335040546938807 0.00730643372831408 +--- 0.037186254715365 1.26122710480548 + 0.064 0.644 +NA|K-Ras 0.062264661774981 5.4424238579025E-05 diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 3dfd5ff9..545c85bd 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -328,6 +328,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (6,'study_tcga_pub_mutations',1,'MUTATION_EXTENDED','MAF','Mutations','Mutation data from whole exome sequencing.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test 
data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); @@ -375,12 +376,25 @@ INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALU INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'0.066638638,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.020369562,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (5,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'0.793930197,'); +-- RPPA +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 10000),'-0.472,1.514,0.145,-0.183,0.913,-0.665,-1.700,0.976,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 207),'-1.102,-0.243,0.018,-0.154,0.330,1.005,0.681,-0.664,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 
208),'-1.221,-0.592,-0.176,-0.310,-1.198,-0.670,0.077,-0.302,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3265),'0.061,-0.055,-0.165,0.517,2.021,0.381,-0.728,0.944,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 369),'-1.129,-0.306,0.180,-0.601,0.166,0.402,0.243,-0.999,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 3845),'0.177,0.404,0.188,0.428,1.676,0.238,0.469,2.161,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 472),'-1.503,-1.925,-1.755,-1.576,-1.029,-1.401,-1.514,-2.074,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 4893),'-1.914,-2.059,-1.228,-1.322,-4.166,-1.187,0.284,-0.130,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'-1.661,-1.392,-1.924,-1.656,-0.361,-1.998,-0.136,-0.709,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.233,0.561,-0.106,-0.085,-0.012,0.143,0.141,0.609,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'-0.570,-1.340,-1.544,-0.404,0.632,-1.231,0.771,-0.036,'); -- genetic_profile_samples INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES 
(2,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (3,'2,3,6,8,9,10,12,13,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (5,'2,'); +INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (9,'2,3,6,8,9,10,12,13,'); -- patient INSERT INTO "patient" ("INTERNAL_ID","STABLE_ID","CANCER_STUDY_ID") VALUES (1,'TCGA-A1-A0SB',1); From 8b68331266359b73209da2d7114f6918bbdf232f Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:24:37 +0200 Subject: [PATCH 04/27] Add normal sample to test data to test skipping --- .../data_expression_Zscores.txt | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 1ca71772..4204bf1e 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,24 +1,24 @@ -Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 -AKT3 10000 0.6393 0.5377 -AKT1 207 0.7850 0.0426 -# All after the pipe has to be removed -AKT2|TEST 208 1.0741 0.7180 -HRAS 3265 -0.1735 -0.6412 -# This gene absent in this file, but it's still part of the profile and has to be updated -#ARAF 369 0.6393 0.5377 -KRAS 3845 0.7850 0.0426 -ATM 472 1.0741 0.7180 -# This line missing the hugo symbol and the gene has to be detected by entrez id - 4893 -0.1735 -0.6412 -# This line missing the entrez id and the gene has to be detected by hugo symbol -BRCA1 0.6393 0.5377 -BRAF 673 0.7850 0.0426 -# Duplicate lines should be ignored -BRAF 673 0.7851 0.0427 -BRCA2 675 1.0741 0.7180 -# This
gene is new! the empty values should be set for the already existing samples in the database -CDK1 983 -0.1735 -0.6412 -# These lines have to be skipped -/// 369 0.6393 0.5377 ---- 3845 0.7850 0.0426 - 1.0741 0.7180 +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SD-01 +AKT3 10000 0.6393 0.1 0.5377 +AKT1 207 0.785 0.1 0.0426 +# All after the pipe has to be removed +AKT2|TEST 208 1.0741 0.1 0.718 +HRAS 3265 -0.1735 0.1 -0.6412 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ARAF 369 0.6393 0.1 0.5377 +KRAS 3845 0.785 0.1 0.0426 +ATM 472 1.0741 0.1 0.718 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -0.1735 0.1 -0.6412 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 0.6393 0.1 0.5377 +BRAF 673 0.785 0.1 0.0426 +# Duplicate lines should be ignored +BRAF 673 0.7851 0.1 0.0427 +BRCA2 675 1.0741 0.1 0.718 +# This gene is new! 
the empty values should be set for the already existing samples in the database +CDK1 983 -0.1735 0.1 -0.6412 +# These lines have to be skipped +/// 369 0.6393 0.1 0.5377 +--- 3845 0.785 0.1 0.0426 + 1.0741 0.1 0.718 From b18aab11e0b706cd2ba81fd9e80dfef088fe5e68 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 10:49:38 +0200 Subject: [PATCH 05/27] Add rows with more columns than in the header to skip --- .../data_expression_Zscores.txt | 22 ++++++++++++------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 4204bf1e..6764c6b1 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -1,24 +1,30 @@ Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SD-01 AKT3 10000 0.6393 0.1 0.5377 AKT1 207 0.785 0.1 0.0426 -# All after the pipe has to be removed +# All after the pipe has to be removed AKT2|TEST 208 1.0741 0.1 0.718 HRAS 3265 -0.1735 0.1 -0.6412 -# This gene absent in this file, but it's still part of the profile and has to be updated +# This gene absent in this file, but it's still part of the profile and has to be updated #ARAF 369 0.6393 0.1 0.5377 KRAS 3845 0.785 0.1 0.0426 ATM 472 1.0741 0.1 0.718 -# This line missing the hugo symbol and the gene has to be detected by entrez id +# This line missing the hugo symbol and the gene has to be detected by entrez id 4893 -0.1735 0.1 -0.6412 -# This line missing the entrez id and the gene has to be detected by hugo symbol +# This line missing the entrez id and the gene has to be detected by hugo symbol BRCA1 0.6393 0.1 0.5377 BRAF 673 0.785 0.1 0.0426 -# Duplicate lines should be ignored +# Duplicate lines should be ignored BRAF 673 0.7851 0.1 0.0427 -BRCA2 675 1.0741 0.1 0.718 -# This
gene is new! the empty values should be set for the already existing samples in the database +# Although this row has 2 extra columns, we are ok with that as they contain blank values +BRCA2 675 1.0741 0.1 0.718 +# This gene is new! the empty values should be set for the already existing samples in the database CDK1 983 -0.1735 0.1 -0.6412 -# These lines have to be skipped +# These lines have to be skipped +# One column too much +FGFR3 2261 0.045 0.1 0.675 0.0224575 +# Multigene sign /// 369 0.6393 0.1 0.5377 +# Unknown gene sign --- 3845 0.785 0.1 0.0426 +# Empty gene info 1.0741 0.1 0.718 From ea688c315d000b2450351cdcca94cea963f4f117 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:00:03 +0200 Subject: [PATCH 06/27] Skip rows that don't have enough sample columns --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 +++- .../incremental/tab_delim_data/data_expression_Zscores.txt | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 76ffcb0b..5432c2ff 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -300,7 +300,9 @@ public void importData() throws IOException, DaoException { if (rowParts.length > headerColumns && line.split("\t").length > headerColumns) { ProgressMonitor.logWarning("Ignoring line with more fields (" + rowParts.length + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); - + } else if (rowParts.length < headerColumns) { + ProgressMonitor.logWarning("Ignoring line with less fields (" + rowParts.length + + ") than specified in the headers(" + headerColumns + "): \n" + rowParts[0]); } else { String sampleValues[] = ArrayUtils.subarray(rowParts, sampleStartIndex, rowParts.length > headerColumns ? 
headerColumns : rowParts.length); diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 6764c6b1..5c3c9012 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -22,6 +22,8 @@ CDK1 983 -0.1735 0.1 -0.6412 # These lines have to be skipped # One column too much FGFR3 2261 0.045 0.1 0.675 0.0224575 +# No sample columns +PIEZO1 9780 # Multigene sign /// 369 0.6393 0.1 0.5377 # Unknown gene sign From cdae5011077f5f4e275fc44a1c776226de63ae16 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:17:44 +0200 Subject: [PATCH 07/27] Test for invalid entrez id --- .../java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java | 3 ++- .../java/org/mskcc/cbio/portal/scripts/ImportGeneData.java | 2 +- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 3 +-- .../java/org/mskcc/cbio/portal/util/EntrezValidator.java | 7 +++++++ .../incremental/tab_delim_data/data_expression_Zscores.txt | 2 ++ 5 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java index 1f58acb7..3adbfb53 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneOptimized.java @@ -46,6 +46,7 @@ import java.util.regex.Matcher; import java.util.regex.Pattern; import org.mskcc.cbio.portal.model.CanonicalGene; +import org.mskcc.cbio.portal.util.EntrezValidator; import org.mskcc.cbio.portal.util.ProgressMonitor; /** @@ -322,7 +323,7 @@ public List guessGene(String geneId, String chr) { } CanonicalGene gene; - if (geneId.matches("[0-9]+")) { // likely to be a entrez gene id + if 
(EntrezValidator.isaValidEntrezId(geneId)) { // likely to be a entrez gene id gene = getGene(Integer.parseInt(geneId)); if (gene!=null) { return Collections.singletonList(gene); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java index 0ab8bd88..cc3300c0 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGeneData.java @@ -188,7 +188,7 @@ public static void importHGNCData(File geneFile, String genomeBuild) throws IOEx ProgressMonitor.incrementCurValue(); ConsoleUtil.showProgress(); String parts[] = line.split("\t", -1); // include trailing empty strings - if (!parts[0].matches("[0-9]+")) { + if (!EntrezValidator.isaValidEntrezId(parts[0])) { ProgressMonitor.logWarning("Skipping gene with invalid entrez gene id '" + parts[1] + "'"); continue; } diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 5432c2ff..8627dce4 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -336,8 +336,7 @@ public void importData() throws IOException, DaoException { if (entrez != null && entrez.isEmpty()) { entrez = null; } - if (entrez != null && !entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values + if (entrez != null && !EntrezValidator.isaValidEntrezId(entrez)) { ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); } else { String firstCellValue = rowParts[0]; diff --git a/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java b/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java new file mode 100644 index 00000000..335bfd66 --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/EntrezValidator.java 
@@ -0,0 +1,7 @@ +package org.mskcc.cbio.portal.util; + +public class EntrezValidator { + public static boolean isaValidEntrezId(String entrez) { + return entrez.matches("[0-9]+"); + } +} diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index 5c3c9012..d7b646b0 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -24,6 +24,8 @@ CDK1 983 -0.1735 0.1 -0.6412 FGFR3 2261 0.045 0.1 0.675 0.0224575 # No sample columns PIEZO1 9780 +# invalid entrez id +P2RY10 -1 0.741 0.1 0.685 # Multigene sign /// 369 0.6393 0.1 0.5377 # Unknown gene sign From cf458a4d04f036626f9e1c038d58def0aa84fd22 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 7 May 2024 11:55:01 +0200 Subject: [PATCH 08/27] Extract common code from inc. tab. delim. tests --- .../TestIncrementalTabDelimData.java | 55 +++++++------------ 1 file changed, 21 insertions(+), 34 deletions(-) diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index f57e5031..a201901e 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -17,6 +17,7 @@ package org.mskcc.cbio.portal.integrationTest.incremental; +import org.jetbrains.annotations.NotNull; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -25,10 +26,7 @@ import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.dao.DaoSample; -import 
org.mskcc.cbio.portal.model.CancerStudy; import org.mskcc.cbio.portal.model.GeneticProfile; -import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -38,7 +36,6 @@ import java.io.File; import java.io.IOException; import java.util.HashMap; -import java.util.HashSet; import java.util.Set; import static org.junit.Assert.assertEquals; @@ -92,12 +89,7 @@ public void testMrnaExpression() throws DaoException, IOException { GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); assertNotNull(mrnaProfile); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); - }); + assertPriorDataState(beforeResult); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); @@ -116,20 +108,7 @@ public void testMrnaExpression() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); - afterResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); - if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { - return; - } - sampleIdToValue.forEach((sampleId, value) -> { - if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { - return; - } - assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, - beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); - }); - }); + assertNoChange(beforeResult, afterResult); assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); @@ -147,12 +126,7 @@ public void testRppa() throws DaoException, IOException { GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); assertNotNull(rppaProfile); HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for 
gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); - }); + assertPriorDataState(beforeResult); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_rppa.txt"); @@ -171,6 +145,23 @@ public void testRppa() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertNoChange(beforeResult, afterResult); + assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); + assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + } + + private void assertPriorDataState(HashMap> beforeResult) { + assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); + assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { + assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); + assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", 
beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + }); + } + + private void assertNoChange(HashMap> beforeResult, HashMap> afterResult) { assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); afterResult.forEach((entrezGeneId, sampleIdToValue) -> { assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); @@ -185,10 +176,6 @@ public void testRppa() throws DaoException, IOException { beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); }); }); - assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); } } From 9ea1adacc0bceb912e03f5b9961b2093374bf42d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:32:53 +0200 Subject: [PATCH 09/27] Implement incremental upload of cna data via tab. delim.
loader --- .../mskcc/cbio/portal/dao/DaoCnaEvent.java | 27 +- .../portal/scripts/ImportTabDelimData.java | 13 +- .../TestIncrementalTabDelimData.java | 253 ++++++++++++++---- .../tab_delim_data/data_cna_discrete.txt | 17 ++ .../data_cna_pd_annotations.txt | 7 + 5 files changed, 263 insertions(+), 54 deletions(-) create mode 100644 src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt create mode 100644 src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index e7785d4f..0e4ab7e8 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -119,7 +119,32 @@ private static long addCnaEventDirectly(CnaEvent cnaEvent) throws DaoException { JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); } } - + + public static void removeSampleCnaEvents(int cnaProfileId, List sampleIds) throws DaoException { + Connection con = null; + PreparedStatement pstmt = null; + ResultSet rs = null; + try { + con = JdbcUtil.getDbConnection(DaoCnaEvent.class); + pstmt = con.prepareStatement + ("DELETE sample_cna_event, alteration_driver_annotation" + + " FROM sample_cna_event" + + " JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? 
AND sample_cna_event.`SAMPLE_ID` IN (" + + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + + ")"); + pstmt.setInt(1, cnaProfileId); + for (int i = 0; i < sampleIds.size(); i++) { + pstmt.setInt(i + 2, sampleIds.get(i)); + } + pstmt.executeUpdate(); + } catch (SQLException e) { + throw new DaoException(e); + } finally { + JdbcUtil.closeAll(DaoCnaEvent.class, con, pstmt, rs); + } + } + public static Map> getSamplesWithAlterations( Collection eventIds) throws DaoException { return getSamplesWithAlterations(StringUtils.join(eventIds, ",")); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 8627dce4..86c5fef8 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -36,7 +36,6 @@ import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; -import java.util.stream.Collectors; import java.util.stream.IntStream; import java.util.stream.Stream; @@ -694,7 +693,11 @@ private boolean saveLine(String[] values, recordStored = this.saveValues(genes.get(0), values); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { - CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, genes)); + if (updateMode) { + DaoCnaEvent.removeSampleCnaEvents(geneticProfileId, orderedImportedSampleList); + } + long entrezGeneId = genes.get(0).getEntrezGeneId(); + CnaUtil.storeCnaEvents(existingCnaEvents, composeCnaEventsToAdd(values, entrezGeneId)); } } else { if (isRppaProfile) { // for protein data, duplicate the data @@ -790,9 +793,11 @@ private List parseGenes(String entrez, String geneSymbol) { return List.of(); } - private List composeCnaEventsToAdd(String[] values, List genes) { + private List composeCnaEventsToAdd(String[] values, long entrezGeneId) { + if 
(updateMode) { + values = updateValues((int) entrezGeneId, values); + } List cnaEventsToAdd = new ArrayList(); - long entrezGeneId = genes.get(0).getEntrezGeneId(); for (int i = 0; i < values.length; i++) { // temporary solution -- change partial deletion back to full deletion. diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index a201901e..64c4f4a6 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -17,15 +17,18 @@ package org.mskcc.cbio.portal.integrationTest.incremental; -import org.jetbrains.annotations.NotNull; +import org.cbioportal.model.CNA; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticEventImpl; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -35,14 +38,17 @@ import java.io.File; import java.io.IOException; +import java.util.Arrays; import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.stream.Collectors; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertFalse; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertTrue; -import static 
org.mskcc.cbio.portal.dao.DaoMutation.getMutations; /** * Tests Incremental Import of Tab Delimited Data. @@ -61,23 +67,6 @@ public void setUp() throws DaoException { DaoCancerStudy.reCacheAll(); } - // Hugo_Symbol: CDK1 - static final long NEW_GENE_ENTREZ_ID = 983l; - - /** - * Gene that is part of the platform, but absent during the incremental upload - */ - // Hugo_Symbol: ARAF - static final long ABSENT_GENE_ENTREZ_ID = 369l; - static final Set TEST_ENTREZ_GENE_IDS = Set.of(10000l, 207l, 208l, 3265l, ABSENT_GENE_ENTREZ_ID, 3845l, 472l, 4893l, 672l, 673l, 675l, NEW_GENE_ENTREZ_ID); - - // stable_id: TCGA-A1-A0SB-01 - static final int NEW_SAMPLE_ID = 1; - - // stable_id: TCGA-A1-A0SD-01 - static final int UPDATED_SAMPLE_ID = 2; - static final Set TEST_SAMPLE_IDS = Set.of(NEW_SAMPLE_ID, UPDATED_SAMPLE_ID, 3, 6, 8, 9, 10, 12, 13); - /** * Test incremental upload of MRNA_EXPRESSION */ @@ -86,10 +75,28 @@ public void testMrnaExpression() throws DaoException, IOException { /** * Prior checks */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); assertNotNull(mrnaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - 
assertPriorDataState(beforeResult); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); @@ -108,11 +115,18 @@ public void testMrnaExpression() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertNoChange(beforeResult, afterResult); - assertEquals("-0.1735", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-0.6412", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.12475", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + HashMap newGeneRow = afterResult.get(newGeneEntrezId); + assertEquals("-0.1735", newGeneRow.get(newSampleId)); + assertEquals("-0.6412", newGeneRow.get(updateSampleId)); + HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); + assertEquals("", absentGeneRow.get(newSampleId)); + assertEquals("-1.12475", absentGeneRow.get(updateSampleId)); } /** @@ -123,10 +137,28 @@ public void testRppa() throws DaoException, IOException { /** * Prior checks */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 
675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); assertNotNull(rppaProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); File dataFile = new File(dataFolder, "data_rppa.txt"); @@ -145,34 +177,157 @@ public void testRppa() throws DaoException, IOException { * After test assertions */ HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertNoChange(beforeResult, afterResult); - assertEquals("-0.141047088398489", afterResult.get(NEW_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("1.61253243564957", afterResult.get(NEW_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); - assertEquals("", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(NEW_SAMPLE_ID)); - assertEquals("-1.129", afterResult.get(ABSENT_GENE_ENTREZ_ID).get(UPDATED_SAMPLE_ID)); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); + 
assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("-1.129", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + } + + /** + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) + */ + @Test + public void testDiscreteCNA() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + afterSampleIds.add(newSampleId); + + GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + assertNotNull(discreteCNAProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); + Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); + List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + null, + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> beforeSampleIdToSampleCnaEvents = 
beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_cna_discrete.txt"); + File pdAnnotations = new File(dataFolder, "data_cna_pd_annotations.txt"); + + /** + * Test + */ + ImportTabDelimData importer = new ImportTabDelimData(dataFile, + discreteCNAProfile.getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()); + importer.setPdAnnotationsFile(pdAnnotations); + importer.importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("1", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + + List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + afterResult.keySet(), + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); + 
beforeCnaEventsSampleIds.forEach(sampleId -> { + if (sampleId == updateSampleId) { + return; + } + Set beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); + }); + Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 208l, CNA.HOMDEL, + 3265l, CNA.AMP, + 4893l, CNA.HOMDEL, + 672l, CNA.AMP, + 673l, CNA.AMP, + 675l, CNA.HOMDEL, + newGeneEntrezId, CNA.HOMDEL + ), + newSampleEntrezGeneIdToCnaAlteration); + Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 10000l, CNA.HOMDEL, + 207l, CNA.AMP, + 3845l, CNA.AMP, + //FIXME + //absentGeneEntrezId, CNA.HOMDEL, + 673l, CNA.HOMDEL, + newGeneEntrezId, CNA.AMP + ), + updatedSampleEntrezGeneIdToCnaAlteration); } - private void assertPriorDataState(HashMap> beforeResult) { - assertEquals("All but new entrez gene id expected to be in the database for this profile before the upload", TEST_ENTREZ_GENE_IDS.size() - 1, beforeResult.size()); - assertFalse("No new entrez gene id expected to be in the database for this profile before the upload", beforeResult.containsKey(NEW_GENE_ENTREZ_ID)); + private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { + assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); beforeResult.forEach((entrezGeneId, 
sampleIdToValue) -> { - assertEquals("All but new sample id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", TEST_SAMPLE_IDS.size() - 1, beforeResult.get(entrezGeneId).size()); - assertFalse("No new entrez gene id expected to be in the database for this profile for gene with entrez id " + entrezGeneId + " before the upload", beforeResult.get(entrezGeneId).containsKey(NEW_SAMPLE_ID)); + assertEquals("Samples for gene with entrez_id = " + entrezGeneId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entrezGeneId).keySet()); }); } - private void assertNoChange(HashMap> beforeResult, HashMap> afterResult) { - assertEquals("These genes expected to be found after upload", TEST_ENTREZ_GENE_IDS, afterResult.keySet()); - afterResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("These sample ids expected to be found for gene with entrez id " + entrezGeneId+ " after upload", TEST_SAMPLE_IDS, afterResult.get(entrezGeneId).keySet()); - if (entrezGeneId == NEW_GENE_ENTREZ_ID || entrezGeneId == ABSENT_GENE_ENTREZ_ID) { - return; - } - sampleIdToValue.forEach((sampleId, value) -> { - if (sampleId == NEW_SAMPLE_ID || sampleId == UPDATED_SAMPLE_ID) { - return; - } - assertEquals("The associated value is not expected change associated sample id " + sampleId + " and entrez gene id " + entrezGeneId, + private void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, + Set geneEntrezIds, + Set sampleIds) { + geneEntrezIds.forEach(entrezGeneId -> { + assertTrue("After result is expected to contain entrez_id=" + entrezGeneId, + afterResult.containsKey(entrezGeneId)); + sampleIds.forEach(sampleId -> { + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entrez_id=" + entrezGeneId, + afterResult.get(entrezGeneId).containsKey(sampleId)); + assertEquals("The values for sample_id=" + sampleId + + " and entrez_id=" + entrezGeneId + " before and after upload have 
to match.", beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); }); }); diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt b/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt new file mode 100644 index 00000000..7664e868 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt @@ -0,0 +1,17 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-XX-0800-01 TCGA-AB-CDEF-10-BLOOD_DERIVED_NORMAL TCGA-A1-A0SO-01 +AKT3 10000 0 -2 -2 +AKT1 207 -1 2 2 +# All after the pipe has to be removed +AKT2|TEST 208 -2 2 -1 +HRAS 3265 2 2 0 +KRAS 3845 0 -2 2 +# This gene absent in this file, but it's still part of the profile and has to be updated +#ATM 472 +# This line missing the hugo symbol and the gene has to be detected by entrez id + 4893 -2 -2 -1 +# This line missing the entrez id and the gene has to be detected by hugo symbol +BRCA1 2 2 0 +BRAF 673 2 -2 -2 +BRCA2 675 -1.5 2 0 +# This gene is new! the empty values should be set for the already existing samples in the database +CDK1 983 -2 -2 2 diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt b/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt new file mode 100644 index 00000000..3fbcfc58 --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt @@ -0,0 +1,7 @@ +SAMPLE_ID Entrez_Gene_Id cbp_driver cbp_driver_annotation cbp_driver_tiers cbp_driver_tiers_annotation +TCGA-A1-A0SO-01 3845 Putative_Passenger Test passenger Class 2 Class annotation +TCGA-A1-A0SO-01 208 Putative_Driver Test driver Class 1 Class annotation +TCGA-A1-A0SO-01 983 Putative_Passenger Test passenger +TCGA-XX-0800-01 3845 Class 2 Class annotation +TCGA-XX-0800-01 208 Class 1 Class annotation +TCGA-XX-0800-01 983 Putative_Driver From 03f966025e5d6fa6c2cf42115e6e6d01c381e0fe Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:42:08 +0200 
Subject: [PATCH 10/27] Blank out values for genes not mentioned in the file --- .../mskcc/cbio/portal/scripts/ImportTabDelimData.java | 8 ++------ .../incremental/TestIncrementalTabDelimData.java | 9 +++------ .../tab_delim_data/data_expression_Zscores.txt | 2 +- .../resources/incremental/tab_delim_data/data_rppa.txt | 2 +- 4 files changed, 7 insertions(+), 14 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 86c5fef8..3cfeaffc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -388,12 +388,8 @@ private void expandRemainingGeneticEntityTabDelimitedRowsWithBlankValue() { // Expand remaining genetic entity id rows that were not mentioned in the file new HashSet<>(geneticAlterationMap.keySet()).forEach(geneticEntityId -> { try { - String[] values = orderedImportedSampleList.stream() - .map(sampleId -> - geneticAlterationMap.get(geneticEntityId).containsKey(sampleId) ? 
- geneticAlterationMap.get(geneticEntityId).get(sampleId) : "") - .toArray(String[]::new); - + String[] values = new String[orderedImportedSampleList.size()]; + Arrays.fill(values, ""); saveValues(geneticEntityId, values); } catch (DaoException e) { throw new RuntimeException(e); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index 64c4f4a6..ddea3269 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -28,7 +28,6 @@ import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; -import org.mskcc.cbio.portal.model.GeneticEventImpl; import org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -126,7 +125,7 @@ public void testMrnaExpression() throws DaoException, IOException { assertEquals("-0.6412", newGeneRow.get(updateSampleId)); HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); assertEquals("", absentGeneRow.get(newSampleId)); - assertEquals("-1.12475", absentGeneRow.get(updateSampleId)); + assertEquals("", absentGeneRow.get(updateSampleId)); } /** @@ -186,7 +185,7 @@ public void testRppa() throws DaoException, IOException { assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("-1.129", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); } /** @@ -260,7 
+259,7 @@ public void testDiscreteCNA() throws DaoException, IOException { assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("1", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), afterResult.keySet(), @@ -300,8 +299,6 @@ public void testDiscreteCNA() throws DaoException, IOException { 10000l, CNA.HOMDEL, 207l, CNA.AMP, 3845l, CNA.AMP, - //FIXME - //absentGeneEntrezId, CNA.HOMDEL, 673l, CNA.HOMDEL, newGeneEntrezId, CNA.AMP ), diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt index d7b646b0..dc189cec 100644 --- a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt +++ b/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt @@ -5,7 +5,7 @@ AKT1 207 0.785 0.1 0.0426 AKT2|TEST 208 1.0741 0.1 0.718 HRAS 3265 -0.1735 0.1 -0.6412 # This gene absent in this file, but it's still part of the profile and has to be updated -#ARAF 369 0.6393 0.1 0.5377 +#ARAF 369 KRAS 3845 0.785 0.1 0.0426 ATM 472 1.0741 0.1 0.718 # This line missing the hugo symbol and the gene has to be detected by entrez id diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/tab_delim_data/data_rppa.txt index bc3b858a..0953ce99 100644 --- a/src/test/resources/incremental/tab_delim_data/data_rppa.txt +++ b/src/test/resources/incremental/tab_delim_data/data_rppa.txt @@ -6,7 +6,7 @@ AKT1 AKT2 AKT3|akt1 1.61253243664957 -0.141077088398489 AKT2|TEST 5.4424238579025E-05 0.062264661774981 HRAS|hras 0.37624053370992 0.270399126328659 # This gene absent in this 
file, but it's still part of the profile and has to be updated 0.407622077164699 -0.326522823583974 -#ARAF 0.383702820778609 0.218650367364756 +#ARAF KRAS|kras -0.335040546938807 0.00730643372831408 ATM|atm 0.037186254715365 1.26122710480548 # This line missing the entrez id and the gene has to be detected by hugo symbol 0.062264661774981 5.4424238579025E-05 From 93cc6ffa80f8c7f0e0fad42b4775be384f3d8709 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 11:52:03 +0200 Subject: [PATCH 11/27] Remove unused code --- .../org/mskcc/cbio/portal/scripts/ImportTabDelimData.java | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 3cfeaffc..fd658f0a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -219,7 +219,6 @@ public void importData() throws IOException, DaoException { sampleIds = new String[headerParts.length - sampleStartIndex]; System.arraycopy(headerParts, sampleStartIndex, sampleIds, 0, headerParts.length - sampleStartIndex); - int nrUnknownSamplesAdded = 0; ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); Map, Map> pdAnnotationsForStableSampleIds = null; @@ -261,9 +260,6 @@ public void importData() throws IOException, DaoException { if (pdAnnotationsForStableSampleIds != null && !pdAnnotationsForStableSampleIds.keySet().isEmpty()) { ProgressMonitor.logWarning("WARNING: Following pd annotation sample-entrezId pairs newer used in the data file: " + pdAnnotationsForStableSampleIds.keySet()); } - if (nrUnknownSamplesAdded > 0) { - ProgressMonitor.logWarning("WARNING: Number of samples added on the fly because they were missing in clinical data: " + nrUnknownSamplesAdded); - } if (samplesSkipped > 0) { ProgressMonitor.setCurrentMessage(" --> total number of 
samples skipped (normal samples): " + samplesSkipped); } From 842bcd3c9ff867667206eab7ce11f4ad78e42c62 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 14:52:19 +0200 Subject: [PATCH 12/27] Throw unsupported operation exception for GENESET_SCORE incremental upload --- .../portal/scripts/ImportTabDelimData.java | 5 +++++ .../TestIncrementalTabDelimData.java | 21 +++++++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index fd658f0a..d277cb95 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -153,6 +153,11 @@ public ImportTabDelimData( this.geneticAlterationImporter = new GeneticAlterationImporter(geneticProfileId, daoGeneticAlteration); this.daoGene = daoGene; this.geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); + if (this.updateMode + && geneticProfile != null + && this.geneticProfile.getGeneticAlterationType() == GeneticAlterationType.GENESET_SCORE) { + throw new UnsupportedOperationException("Incremental upload of geneset scores is not supported."); + } } /** diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index ddea3269..ec82dcc9 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -28,6 +28,7 @@ import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; import 
org.mskcc.cbio.portal.model.GeneticProfile; import org.mskcc.cbio.portal.scripts.ImportTabDelimData; import org.springframework.test.annotation.Rollback; @@ -47,6 +48,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; /** @@ -305,6 +307,25 @@ public void testDiscreteCNA() throws DaoException, IOException { updatedSampleEntrezGeneIdToCnaAlteration); } + @Test + public void testGsvaIsNotSupported() throws DaoException, IOException { + GeneticProfile gsvaProfile = new GeneticProfile(); + gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); + gsvaProfile.setStableId("gsva_scores"); + gsvaProfile.setDatatype("GENESET_SCORE"); + gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); + gsvaProfile.setProfileName("gsva test platform"); + DaoGeneticProfile.addGeneticProfile(gsvaProfile); + + assertThrows(UnsupportedOperationException.class, () -> + new ImportTabDelimData(File.createTempFile("gsva", "test"), + DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance())); + } + private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { From 22b688aaeeab8cea4744f0e345e28cabcc5c1e60 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 16:45:57 +0200 Subject: [PATCH 13/27] Add generic assay data incremental upload test --- .../TestIncrementalTabDelimData.java | 117 +++++++++++++++--- .../tab_delim_data/data_treatment_ic50.txt | 8 ++ src/test/resources/seed_mini.sql | 12 ++ 3 files changed, 122 insertions(+), 15 deletions(-) create mode 100644 
src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java index ec82dcc9..4f4b2aef 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java @@ -18,6 +18,7 @@ package org.mskcc.cbio.portal.integrationTest.incremental; import org.cbioportal.model.CNA; +import org.jetbrains.annotations.NotNull; import org.junit.Before; import org.junit.Test; import org.junit.runner.RunWith; @@ -26,6 +27,7 @@ import org.mskcc.cbio.portal.dao.DaoException; import org.mskcc.cbio.portal.dao.DaoGeneOptimized; import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.dao.DaoGeneticProfile; import org.mskcc.cbio.portal.model.CnaEvent; import org.mskcc.cbio.portal.model.GeneticAlterationType; @@ -48,6 +50,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; import static org.junit.Assert.assertThrows; import static org.junit.Assert.assertTrue; @@ -326,27 +329,111 @@ public void testGsvaIsNotSupported() throws DaoException, IOException { DaoGeneOptimized.getInstance())); } - private void assertPriorDataState(HashMap> beforeResult, Set expectedGeneEntrezIds, Set expectedSampleIds) { - assertEquals(expectedGeneEntrezIds, beforeResult.keySet()); - beforeResult.forEach((entrezGeneId, sampleIdToValue) -> { - assertEquals("Samples for gene with entrez_id = " + entrezGeneId + " have to match expected ones", - expectedSampleIds, beforeResult.get(entrezGeneId).keySet()); + /** + * Test incremental upload of GENERIC_ASSAY + */ + @Test + public void testGenericAssay() 
throws DaoException, IOException { + /** + * Prior checks + */ + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + beforeStableIds.add(absentStableId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); + + /** + * Test + */ + new ImportTabDelimData( + dataFile, + null, + ic50Profile.getGeneticProfileId(), + null, + "NAME,DESCRIPTION,URL", + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance()).importData(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + assertEquals("After result should have the same amount of entries", beforeResult.size(), afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), 
Set.of(noChangeSampleId)); + int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); + assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); + assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); + int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); + assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); + assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); + int absentEntityId = geneStableIdToEntityId(absentStableId); + assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); + assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); + int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); + assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); + assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); + assertNull("No new generic entity has been added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + } + + @NotNull + private Set geneStableIdsToEntityIds(Set beforeStableIds) { + return beforeStableIds.stream().map(stableId -> { + try { + return geneStableIdToEntityId(stableId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toSet()); + } + + private int geneStableIdToEntityId(String stableId) throws DaoException { + return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); + } + + private void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { + assertEquals(expectedEntityIds, beforeResult.keySet()); + beforeResult.forEach((entityId, sampleIdToValue) -> { + assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entityId).keySet()); }); } - private void assertNoChange(HashMap> beforeResult, - HashMap> afterResult, - Set geneEntrezIds, + private void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, 
+ Set entityIds, Set sampleIds) { - geneEntrezIds.forEach(entrezGeneId -> { - assertTrue("After result is expected to contain entrez_id=" + entrezGeneId, - afterResult.containsKey(entrezGeneId)); + entityIds.forEach(entityId -> { + assertTrue("After result is expected to contain entityId=" + entityId, + afterResult.containsKey(entityId)); sampleIds.forEach(sampleId -> { - assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entrez_id=" + entrezGeneId, - afterResult.get(entrezGeneId).containsKey(sampleId)); + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, + afterResult.get(entityId).containsKey(sampleId)); assertEquals("The values for sample_id=" + sampleId + - " and entrez_id=" + entrezGeneId + " before and after upload have to match.", - beforeResult.get(entrezGeneId).get(sampleId), afterResult.get(entrezGeneId).get(sampleId)); + " and entityId=" + entityId + " before and after upload have to match.", + beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); }); }); } diff --git a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt b/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt new file mode 100644 index 00000000..5edb7cfa --- /dev/null +++ b/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SD-01 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The line will be skipped as the entity stable id is not in the database already +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 545c85bd..1222ce67 100644 --- 
a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -199,6 +199,11 @@ INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYP INSERT INTO "genetic_entity" ("ENTITY_TYPE") VALUES ('GENE'); SET @max_entity_id = (Select MAX(ID) from genetic_entity); INSERT INTO "gene" ("GENETIC_ENTITY_ID","ENTREZ_GENE_ID","HUGO_GENE_SYMBOL","TYPE") VALUES (@max_entity_id,2261,'FGFR3','protein-coding'); +-- Generic genetic entities +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Erlotinib'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Irinotecan'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'L-685458'); +INSERT INTO "genetic_entity" ("ENTITY_TYPE", "STABLE_ID") VALUES ('GENERIC_ASSAY', 'Lapatinib'); -- cna_event INSERT INTO "cna_event" ("CNA_EVENT_ID","ENTREZ_GENE_ID","ALTERATION") VALUES (20093,207,-2); @@ -329,6 +334,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); +INSERT INTO "genetic_profile" 
("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','TREATMENT_RESPONSE','test treatment values','treatment values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); @@ -388,6 +394,11 @@ INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALU INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 672),'-1.661,-1.392,-1.924,-1.656,-0.361,-1.998,-0.136,-0.709,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 673),'0.233,0.561,-0.106,-0.085,-0.012,0.143,0.141,0.609,'); INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (9,(Select "GENETIC_ENTITY_ID" from "gene" where "ENTREZ_GENE_ID" = 675),'-0.570,-1.340,-1.544,-0.404,0.632,-1.231,0.771,-0.036,'); +-- Generic assay data +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Erlotinib'),'5.2,>8,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Irinotecan'),'>8,7.1,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'L-685458'),'>4.6,7.2,'); +INSERT INTO "genetic_alteration" ("GENETIC_PROFILE_ID","GENETIC_ENTITY_ID","VALUES") VALUES (10,(Select "ID" from "genetic_entity" where "STABLE_ID" = 'Lapatinib'),'6.9,>~8,'); -- genetic_profile_samples INSERT INTO "genetic_profile_samples" 
("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (2,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); @@ -395,6 +406,7 @@ INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (4,'1,2,3,4,5,6,7,8,9,10,11,12,13,14,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (5,'2,'); INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (9,'2,3,6,8,9,10,12,13,'); +INSERT INTO "genetic_profile_samples" ("GENETIC_PROFILE_ID","ORDERED_SAMPLE_LIST") VALUES (10,'2,3,'); -- patient INSERT INTO "patient" ("INTERNAL_ID","STABLE_ID","CANCER_STUDY_ID") VALUES (1,'TCGA-A1-A0SB',1); From d11a353ff70f0a3f1370b168f6600ce90edb5d7a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 8 May 2024 17:52:00 +0200 Subject: [PATCH 14/27] Fix integration tests --- .../portal/scripts/ImportTabDelimData.java | 22 +++++++++---------- .../dao/TestDaoGeneticProfile.java | 10 ++++----- .../TestImportCnaDiscreteLongData.java | 14 ++++++------ .../scripts/TestImportGenericAssayData.java | 5 +++-- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index d277cb95..8df2ed8b 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -667,7 +667,7 @@ private boolean saveLine(String[] values, if (!microRNAGenes.isEmpty()) { // for micro rna, duplicate the data for (CanonicalGene gene : microRNAGenes) { - if (this.saveValues(gene, values)) { + if (this.saveValues(gene, values, geneSymbol)) { recordStored = true; } } @@ -687,7 +687,7 @@ private boolean saveLine(String[] values, // none of the matched genes are type "miRNA" if (genes.size() == 1) { // Store all values per gene: 
- recordStored = this.saveValues(genes.get(0), values); + recordStored = this.saveValues(genes.get(0), values, geneSymbol); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored && isDiscretizedCnaProfile) { if (updateMode) { @@ -698,7 +698,7 @@ private boolean saveLine(String[] values, } } else { if (isRppaProfile) { // for protein data, duplicate the data - recordStored = saveRppaValues(values, recordStored, genes); + recordStored = saveRppaValues(values, recordStored, genes, geneSymbol); } else { if (!recordStored) { // this case : @@ -711,16 +711,14 @@ private boolean saveLine(String[] values, return recordStored; } - private boolean saveValues(CanonicalGene canonicalGene, String[] values) throws DaoException { - //TODO Think of better way. We do that to do not remove genes that contain duplicate - if (geneticAlterationImporter.isImportedAlready(canonicalGene)) { - return false; - } + private boolean saveValues(CanonicalGene canonicalGene, String[] values, String geneSymbol) throws DaoException { if (updateMode) { values = updateValues(canonicalGene.getGeneticEntityId(), values); - daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + if (!geneticAlterationImporter.isImportedAlready(canonicalGene)) { + daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), canonicalGene.getGeneticEntityId()); + } } - return geneticAlterationImporter.store(values, canonicalGene, canonicalGene.getHugoGeneSymbolAllCaps()); + return geneticAlterationImporter.store(values, canonicalGene, geneSymbol); } //TODO unify saveValues versions // With update mode the last duplicate wins. 
It's different from the other function @@ -753,9 +751,9 @@ private String[] updateValues(int geneticEntityId, String[] values) { return updatedSampleValues; } - private boolean saveRppaValues(String[] values, boolean recordStored, List genes) throws DaoException { + private boolean saveRppaValues(String[] values, boolean recordStored, List genes, String geneSymbol) throws DaoException { for (CanonicalGene gene : genes) { - if (this.saveValues(gene, values)) { + if (this.saveValues(gene, values, geneSymbol)) { recordStored = true; nrExtraRecords++; } diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java index 8c1afdcc..83e04144 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/dao/TestDaoGeneticProfile.java @@ -72,7 +72,7 @@ public void setUp() throws DaoException public void testDaoGetAllGeneticProfiles() throws DaoException { ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); } @Test @@ -134,12 +134,12 @@ public void testDaoDeleteGeneticProfile() throws DaoException { GeneticProfile geneticProfile = DaoGeneticProfile.getGeneticProfileById(2); - assertEquals(7, DaoGeneticProfile.getCount()); + assertEquals(9, DaoGeneticProfile.getCount()); DaoGeneticProfile.deleteGeneticProfile(geneticProfile); - assertEquals(6, DaoGeneticProfile.getCount()); + assertEquals(8, DaoGeneticProfile.getCount()); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(6, list.size()); + assertEquals(8, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("mRNA expression (microarray)", geneticProfile.getProfileName()); @@ -155,7 +155,7 @@ public void testDaoUpdateGeneticProfile() throws 
DaoException { geneticProfile.getGeneticProfileId(), "Updated Name", "Updated Description")); ArrayList list = DaoGeneticProfile.getAllGeneticProfiles(studyId); - assertEquals(7, list.size()); + assertEquals(9, list.size()); geneticProfile = list.get(0); assertEquals(studyId, geneticProfile.getCancerStudyId()); assertEquals("Updated Name", geneticProfile.getProfileName()); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java index d317aa03..916a16cd 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportCnaDiscreteLongData.java @@ -180,7 +180,7 @@ public void testImportCnaDiscreteLongDataAddsCnaEvents() throws Exception { @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -205,7 +205,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterations() throws Excepti @Test public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_cna_events_missing.txt"); new ImportCnaDiscreteLongData( @@ -233,7 +233,7 @@ public void testImportCnaDiscreteLongDataAddsMissingGeneticAlterations() throws @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamplesInCorrectOrder() throws Exception { 
List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -260,7 +260,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsAndProfileSamples @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_without_entrez_with_hugo.txt"); new ImportCnaDiscreteLongData( @@ -283,7 +283,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithoutEntrezButWithHugo( @Test public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrectHugo() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test_with_wrong_entrez_and_correct_hugo.txt"); new ImportCnaDiscreteLongData( @@ -306,7 +306,7 @@ public void testImportCnaDiscreteLongDataHandlesEntriesWithWrongEntrezAndCorrect @Test public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents() throws Exception { List beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( @@ -334,7 +334,7 @@ public void testImportCnaDiscreteLongDataAddsGeneticAlterationsFromNonCnaEvents( @Test public void testImportCnaDiscreteLongDataIgnoresLineWithDuplicateGene() throws Exception { List 
beforeGeneticAlterations = getAllGeneticAlterations(); - assertEquals(beforeGeneticAlterations.size(), 42); + assertEquals(57, beforeGeneticAlterations.size()); File file = new File("src/test/resources/data_cna_discrete_import_test.txt"); new ImportCnaDiscreteLongData( diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java index a0a33c6d..fa7e0449 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayData.java @@ -95,10 +95,11 @@ public void testImportGenericAssayData() throws Exception { // Open mutational signature test data file File file = new File("src/test/resources/data_mutational_signature.txt"); - + int numRecordsForGenericAssayBefore = getNumRecordsForGenericAssay(); + // import data and test all mutational signatures were added ImportGenericAssayEntity.importData(file, GeneticAlterationType.GENERIC_ASSAY, "name,description", false); - assertEquals(60, getNumRecordsForGenericAssay()); + assertEquals(numRecordsForGenericAssayBefore + 60, getNumRecordsForGenericAssay()); // test wether a record can be retrieved via stable id GenericAssayMeta genericAssayMeta1 = DaoGenericAssay.getGenericAssayMetaByStableId("mean_1"); From 7dfb1bd5f6139221a8553555ac577fa54adce817 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:19:58 +0200 Subject: [PATCH 15/27] Make tab. 
delimiter data uploader transactional --- .../org/mskcc/cbio/portal/dao/JdbcUtil.java | 21 +++- .../portal/scripts/ImportTabDelimData.java | 15 ++- ...estIncrementalTabDelimDataTransaction.java | 119 ++++++++++++++++++ 3 files changed, 149 insertions(+), 6 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java diff --git a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java index 40f9e9ed..48f59d70 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/JdbcUtil.java @@ -35,10 +35,12 @@ import java.sql.*; import java.util.*; import javax.sql.DataSource; -import org.apache.commons.dbcp2.BasicDataSource; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import org.mskcc.cbio.portal.util.*; +import org.springframework.jdbc.datasource.DataSourceTransactionManager; +import org.springframework.jdbc.datasource.TransactionAwareDataSourceProxy; +import org.springframework.transaction.support.TransactionTemplate; /** * Connection Utility for JDBC. 
@@ -50,6 +52,8 @@ public class JdbcUtil { private static DataSource dataSource; private static Map activeConnectionCount = new HashMap(); // keep track of the number of active connection per class/requester private static final Logger LOG = LoggerFactory.getLogger(JdbcUtil.class); + private static DataSourceTransactionManager transactionManager; + private static TransactionTemplate transactionTemplate; /** * Gets the data source @@ -57,17 +61,28 @@ public class JdbcUtil { */ public static DataSource getDataSource() { if (dataSource == null) { - dataSource = new JdbcDataSource(); + dataSource = new TransactionAwareDataSourceProxy(new JdbcDataSource()); + initSpringTx(); } return dataSource; } + private static void initSpringTx() { + transactionManager = new DataSourceTransactionManager(dataSource); + transactionTemplate = new TransactionTemplate(transactionManager); + } + /** * Sets the data source * @param value the data source */ public static void setDataSource(DataSource value) { dataSource = value; + initSpringTx(); + } + + public static TransactionTemplate getTransactionTemplate() { + return transactionTemplate; } /** diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 8df2ed8b..4a118bcc 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -163,10 +163,19 @@ public ImportTabDelimData( /** * Import the Copy Number Alteration, mRNA Expression, protein RPPA, GSVA or generic_assay data * - * @throws IOException IO Error. - * @throws DaoException Database Error. 
*/ - public void importData() throws IOException, DaoException { + public void importData() { + JdbcUtil.getTransactionTemplate().execute(status -> { + try { + doImportData(); + } catch (Throwable e) { + status.setRollbackOnly(); + throw new RuntimeException(e); + } + return null; + }); + } + private void doImportData() throws IOException, DaoException { try { this.numLines = FileUtil.getNumLines(dataFile); } catch (IOException e) { diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java new file mode 100644 index 00000000..f149d959 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimDataTransaction.java @@ -0,0 +1,119 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.cbioportal.model.CNA; +import org.jetbrains.annotations.NotNull; +import org.junit.Before; +import org.junit.Test; +import org.junit.jupiter.api.extension.ExtendWith; +import org.junit.runner.RunWith; +import org.mockito.junit.jupiter.MockitoExtension; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Propagation; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static org.junit.Assert.assertTrue; +import static org.junit.Assert.fail; +import static org.mockito.ArgumentMatchers.anyLong; +import static org.mockito.Mockito.doNothing; +import static org.mockito.Mockito.mock; + +/** + * Tests Transaction for Incremental Import of Tab Delimited Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +public class TestIncrementalTabDelimDataTransaction { + + /** + * Test transaction + */ + @Test + @ExtendWith(MockitoExtension.class) + //Mysql does not support nested transactions. That's why we disable the outer transaction. + @Transactional(propagation = Propagation.NOT_SUPPORTED) + public void testTransaction() throws Exception { + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + + File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + + DaoGeneticAlteration mockedDao = mock(DaoGeneticAlteration.class); + + doNothing().doNothing().doThrow(new DaoException("Simulated dao error")) + .when(mockedDao).deleteAllRecordsInGeneticProfile(anyLong(), anyLong()); + /** + * Test + */ + try { + new ImportTabDelimData(dataFile, + mrnaProfile.getGeneticProfileId(), + null, + true, + mockedDao, + DaoGeneOptimized.getInstance()).importData(); + fail("Import has to fail"); + } catch (RuntimeException runtimeException) { + assertTrue(true); + } + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals(beforeResult, afterResult); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } +} From 71cdf704ae1336b16d8034c6479caac184ba2da6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:44:03 +0200 Subject: [PATCH 16/27] Check for illegal state in tab delim. 
data update It's dangerous as we would further mess up the data in the row --- .../cbio/portal/scripts/ImportTabDelimData.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 4a118bcc..17ba7104 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -421,6 +421,8 @@ private void ensureSampleGeneticProfile(Sample sample) throws DaoException { private void saveOrderedSampleList() throws DaoException { if (updateMode) { ArrayList savedOrderedSampleList = DaoGeneticProfileSamples.getOrderedSampleList(geneticProfileId); + int initialOrderSampleListSize = savedOrderedSampleList.size(); + checkSamplesInDataEqualTo(initialOrderSampleListSize); // add all new sample ids at the end ArrayList extendedSampleList = new ArrayList<>(savedOrderedSampleList); List newSampleIds = orderedSampleList.stream().filter(sampleId -> !savedOrderedSampleList.contains(sampleId)).toList(); @@ -434,6 +436,17 @@ private void saveOrderedSampleList() throws DaoException { DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); } + private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { + geneticAlterationMap.forEach((geneticEntityId, sampleToValue) -> { + if (sampleToValue.size() != initialOrderSampleListSize) { + throw new IllegalStateException("Number of samples (" + + sampleToValue.size() + ") for genetic entity with id " + + geneticEntityId + " does not match with the number in the initial sample list (" + + initialOrderSampleListSize + ")."); + } + }); + } + //TODO move somewhere else private Map zip(K[] keys, V[] values) { Map map = new HashMap<>(); From 2d31dac87f676cc95eb432463ed841a14cf678f0 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 9 May 2024 15:52:21 +0200
Subject: [PATCH 17/27] Wire incremental tab delim. data upload to cli commands --- scripts/importer/cbioportalImporter.py | 2 ++ scripts/importer/cbioportal_common.py | 11 ++++++++++- .../mskcc/cbio/portal/scripts/ImportProfileData.java | 4 ++-- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index f52bbc6a..ea9cfa50 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -464,6 +464,8 @@ def import_incremental_data(jvm_args, data_directory, update_generic_assay_entit Load all data types that are available and support incremental upload """ for meta_file_type in INCREMENTAL_UPLOAD_SUPPORTED_META_TYPES: + if meta_file_type not in meta_file_type_to_meta_files: + continue meta_pairs = meta_file_type_to_meta_files[meta_file_type] for meta_pair in meta_pairs: meta_filename, meta_dictionary = meta_pair diff --git a/scripts/importer/cbioportal_common.py b/scripts/importer/cbioportal_common.py index eaa38a5e..798174ee 100644 --- a/scripts/importer/cbioportal_common.py +++ b/scripts/importer/cbioportal_common.py @@ -370,7 +370,16 @@ class MetaFileTypes(object): MetaFileTypes.PATIENT_ATTRIBUTES, MetaFileTypes.SAMPLE_ATTRIBUTES, MetaFileTypes.MUTATION, - # TODO Add more types here as incremental upload is enabled + MetaFileTypes.MUTATION_UNCALLED, + MetaFileTypes.EXPRESSION, + MetaFileTypes.CNA_DISCRETE, + MetaFileTypes.CNA_CONTINUOUS, + MetaFileTypes.CNA_LOG2, + MetaFileTypes.METHYLATION, + MetaFileTypes.PROTEIN, + MetaFileTypes.GENERIC_ASSAY_CONTINUOUS, + MetaFileTypes.GENERIC_ASSAY_BINARY, + MetaFileTypes.GENERIC_ASSAY_CATEGORICAL, ] IMPORTER_CLASSNAME_BY_META_TYPE = { diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index d34ab2cc..10759baa 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ 
b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -127,7 +127,7 @@ public void run() { geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties"), - false, + overwriteExisting, daoGeneticAlteration, daoGene ); genericAssayProfileImporter.importData(); @@ -152,7 +152,7 @@ public void run() { geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, - false, + overwriteExisting, daoGeneticAlteration, daoGene ); String pdAnnotationsFilename = geneticProfile.getOtherMetaDataField("pd_annotations_filename"); From 49975427651a89fc7926647264714e82a0a5e2a5 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 09:59:01 +0200 Subject: [PATCH 18/27] Expand README with section on how to run incremental upload --- README.md | 40 ++++++++++++++++++++++++++++------------ 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 252b7ba9..51ccaf64 100644 --- a/README.md +++ b/README.md @@ -9,6 +9,34 @@ This repo contains: ## Inclusion in main codebase The `cbioportal-core` code is currently included in the final Docker image during the Docker build process: https://github.com/cBioPortal/cbioportal/blob/master/docker/web-and-data/Dockerfile#L48 +## Running in docker + +Build docker image with: +```bash +docker build -t cbioportal-core . +``` + +Example of how to start loading of the whole study: +```bash +docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o +``` + +### Incremental upload of data + +To add or update specific patient, sample, or molecular data in an already loaded study, you can perform an incremental upload. This process is quicker than reloading the entire study. + +To execute an incremental upload, use the -d (or --data_directory) option instead of -s (or --study_directory). 
Here is an example command: +```bash +docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -d /data/study_es_0_inc -p /data/api_json -o +``` +**Note:** +While the directory should adhere to the standard cBioPortal file formats and study structure, please note the following specific guidelines for incremental uploads: + +- Incremental uploads are not supported for all data types. For instance, uploading study metadata, resources, or GSVA data incrementally is currently unsupported. +- The data pertaining to patient or sample IDs should only include entries that are either new or need updates. + +This method ensures efficient updates without the need for complete study reuploads, saving time and computational resources. + ## How to run integration tests This section guides you through the process of running integration tests by setting up a cBioPortal MySQL database environment using Docker. Please follow these steps carefully to ensure your testing environment is configured correctly. @@ -119,15 +147,3 @@ The script will search for `core-*.jar` in the root of the project: python scripts/importer/metaImport.py -s tests/test_data/study_es_0 -p tests/test_data/api_json_unit_tests -o ``` -## Running in docker - -Build docker image with: -```bash -docker build -t cbioportal-core . -``` - -Example of how to start the loading: -```bash -docker run -it -v $(pwd)/data/:/data/ -v $(pwd)/application.properties:/application.properties cbioportal-core python importer/metaImport.py -s /data/study_es_0 -p /data/api_json -o -``` - From 911ae2868b047b31402ac7e55c04e7af4a1aca3d Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 10:39:19 +0200 Subject: [PATCH 19/27] Address TODOs in tab delim. 
importer --- .../portal/scripts/ImportTabDelimData.java | 20 +----------------- .../org/mskcc/cbio/portal/util/ArrayUtil.java | 21 +++++++++++++++++++ 2 files changed, 22 insertions(+), 19 deletions(-) create mode 100644 src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 17ba7104..cb613b08 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -447,21 +447,6 @@ private void checkSamplesInDataEqualTo(int initialOrderSampleListSize) { }); } - //TODO move somewhere else - private Map zip(K[] keys, V[] values) { - Map map = new HashMap<>(); - - // Check if both arrays have the same length - if (keys.length == values.length) { - for (int i = 0; i < keys.length; i++) { - map.put(keys[i], values[i]); - } - } else { - throw new IllegalArgumentException("Arrays must be of the same length"); - } - return map; - } - private Map, Map> readPdAnnotations(File pdAnnotationsFile) { Map, Map> pdAnnotations = new HashMap<>(); BufferedReader reader; @@ -742,8 +727,6 @@ private boolean saveValues(CanonicalGene canonicalGene, String[] values, String } return geneticAlterationImporter.store(values, canonicalGene, geneSymbol); } - //TODO unify saveValues versions - // With update mode the last duplicate wins. 
It's different from the other function private boolean saveValues(int geneticEntityId, String[] values) throws DaoException { if (updateMode) { daoGeneticAlteration.deleteAllRecordsInGeneticProfile(geneticProfile.getGeneticProfileId(), geneticEntityId); @@ -753,8 +736,7 @@ private boolean saveValues(int geneticEntityId, String[] values) throws DaoExcep } private String[] updateValues(int geneticEntityId, String[] values) { - //TODO swap variables - Map sampleIdToValue = zip(orderedImportedSampleList.toArray(new Integer[0]), values); + Map sampleIdToValue = ArrayUtil.zip(orderedImportedSampleList.toArray(new Integer[0]), values); String[] updatedSampleValues = new String[orderedSampleList.size()]; for (int i = 0; i < orderedSampleList.size(); i++) { updatedSampleValues[i] = ""; diff --git a/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java new file mode 100644 index 00000000..3235d33e --- /dev/null +++ b/src/main/java/org/mskcc/cbio/portal/util/ArrayUtil.java @@ -0,0 +1,21 @@ +package org.mskcc.cbio.portal.util; + +import java.util.HashMap; +import java.util.Map; + +public class ArrayUtil { + public static Map zip(K[] keys, V[] values) { + Map map = new HashMap<>(); + + // Check if both arrays have the same length + if (keys.length == values.length) { + for (int i = 0; i < keys.length; i++) { + map.put(keys[i], values[i]); + } + } else { + throw new IllegalArgumentException("Arrays must be of the same length"); + } + return map; + + } +} \ No newline at end of file From c7343f9ca23c95d8c08729d4fc59b8268b9262ac Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 10 May 2024 11:39:02 +0200 Subject: [PATCH 20/27] Add more data types to incremental data upload folder --- tests/system_tests_import_data.py | 15 +++++++++++++++ .../study_es_0_inc/data_cna_discrete.txt | 10 ++++++++++ tests/test_data/study_es_0_inc/data_cna_log2.txt | 10 ++++++++++ .../study_es_0_inc/data_expression_median.txt | 10 
++++++++++ .../study_es_0_inc/data_methylation_hm27.txt | 10 ++++++++++ .../study_es_0_inc/data_treatment_ic50.txt | 11 +++++++++++ .../study_es_0_inc/meta_cna_discrete.txt | 10 ++++++++++ tests/test_data/study_es_0_inc/meta_cna_log2.txt | 8 ++++++++ .../study_es_0_inc/meta_expression_median.txt | 8 ++++++++ .../study_es_0_inc/meta_methylation_hm27.txt | 8 ++++++++ .../study_es_0_inc/meta_treatment_ic50.txt | 12 ++++++++++++ 11 files changed, 112 insertions(+) create mode 100644 tests/test_data/study_es_0_inc/data_cna_discrete.txt create mode 100644 tests/test_data/study_es_0_inc/data_cna_log2.txt create mode 100644 tests/test_data/study_es_0_inc/data_expression_median.txt create mode 100644 tests/test_data/study_es_0_inc/data_methylation_hm27.txt create mode 100644 tests/test_data/study_es_0_inc/data_treatment_ic50.txt create mode 100644 tests/test_data/study_es_0_inc/meta_cna_discrete.txt create mode 100644 tests/test_data/study_es_0_inc/meta_cna_log2.txt create mode 100644 tests/test_data/study_es_0_inc/meta_expression_median.txt create mode 100644 tests/test_data/study_es_0_inc/meta_methylation_hm27.txt create mode 100644 tests/test_data/study_es_0_inc/meta_treatment_ic50.txt diff --git a/tests/system_tests_import_data.py b/tests/system_tests_import_data.py index 097e6c01..64361571 100755 --- a/tests/system_tests_import_data.py +++ b/tests/system_tests_import_data.py @@ -107,6 +107,16 @@ def test_incremental_load(self, run_java, locate_jar): '--meta', f'{data_directory}/meta_clinical_samples.txt', '--loadMode', 'bulkload', '--data', f'{data_directory}/data_clinical_samples.txt', '--noprogress') mutation_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', '--meta', f'{data_directory}/meta_mutations_extended.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_mutations_extended.maf', '--noprogress') + cna_discrete_call = call(*common_part, 
'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_discrete.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_discrete.txt', '--noprogress') + cna_log2_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_cna_log2.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_cna_log2.txt', '--noprogress') + expression_median_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_expression_median.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_expression_median.txt', '--noprogress') + methylation_hm27_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_methylation_hm27.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_methylation_hm27.txt', '--noprogress') + treatment_ic50_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.ImportProfileData', '--overwrite-existing', + '--meta', f'{data_directory}/meta_treatment_ic50.txt', '--loadMode', 'bulkload', '--update-info', 'False', '--data', f'{data_directory}/data_treatment_ic50.txt', '--noprogress') case_list_call = call(*common_part, 'org.mskcc.cbio.portal.scripts.UpdateCaseListsSampleIds', '--meta', f'{data_directory}/meta_clinical_samples.txt', '--case-lists', f'{data_directory}/case_lists') @@ -115,6 +125,11 @@ def test_incremental_load(self, run_java, locate_jar): clinical_patient_call, clinical_sample_call, mutation_call, + cna_discrete_call, + cna_log2_call, + expression_median_call, + methylation_hm27_call, + treatment_ic50_call, case_list_call, ]) diff --git a/tests/test_data/study_es_0_inc/data_cna_discrete.txt 
b/tests/test_data/study_es_0_inc/data_cna_discrete.txt new file mode 100644 index 00000000..7915f45b --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_discrete.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0 0 -1 + 375790 -1 -1 0 +ATAD3A 55210 0 0 -2 +ATAD3B 83858 -2 -1 0 +ATAD3C 219293 0 0 0 +#AURKAIP1 54998 +ERCC5 2073 0 -1 -2 +ACP3 55 0 0 0 +TP53 -1 0 -2 diff --git a/tests/test_data/study_es_0_inc/data_cna_log2.txt b/tests/test_data/study_es_0_inc/data_cna_log2.txt new file mode 100644 index 00000000..0eb820a7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_cna_log2.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.751 0.533 0.114 + 375790 0.062 0.071 0.948 +ATAD3A 55210 0.487 0.695 0.364 +ATAD3B 83858 0.150 0.492 0.300 +ATAD3C 219293 0.995 0.170 0.654 +#AURKAIP1 54998 +ERCC5 2073 0.816 0.514 0.165 +ACP3 55 0.252 0.713 0.513 +TP53 0.360 0.538 0.891 diff --git a/tests/test_data/study_es_0_inc/data_expression_median.txt b/tests/test_data/study_es_0_inc/data_expression_median.txt new file mode 100644 index 00000000..d5c4a9a0 --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_expression_median.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.096 0.826 0.032 + 375790 0.309 0.399 0.680 +ATAD3A 55210 0.569 0.189 0.266 +ATAD3B 83858 0.829 0.473 0.611 +ATAD3C 219293 0.307 0.445 0.045 +#AURKAIP1 54998 +ERCC5 2073 0.171 0.766 0.590 +ACP3 55 0.422 0.870 0.745 +TP53 0.179 0.694 0.808 diff --git a/tests/test_data/study_es_0_inc/data_methylation_hm27.txt b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt new file mode 100644 index 00000000..d2c67abc --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_methylation_hm27.txt @@ -0,0 +1,10 @@ +Hugo_Symbol Entrez_Gene_Id TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +ACAP3 116983 0.022 0.681 0.790 + 375790 0.435 0.340 0.321 
+ATAD3A 55210 0.229 0.946 0.439 +ATAD3B 83858 0.885 0.707 0.664 +ATAD3C 219293 0.660 0.315 0.694 +#AURKAIP1 54998 +ERCC5 2073 0.436 0.749 0.345 +ACP3 55 0.622 0.396 0.029 +TP53 0.563 0.686 0.607 diff --git a/tests/test_data/study_es_0_inc/data_treatment_ic50.txt b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt new file mode 100644 index 00000000..2a507cef --- /dev/null +++ b/tests/test_data/study_es_0_inc/data_treatment_ic50.txt @@ -0,0 +1,11 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB-01 TCGA-A1-A0SB-03 TCGA-BH-NEW +17-AAG Name of 17-AAG Desc of 17-AAG Url of 17-AAG 0.315 0.329701692 0.053038094 +AEW541 Name of AEW541 Desc of AEW541 Url of AEW541 >8 2.353 2.68212986 +AZD0530 Name of AZD0530 Desc of AZD0530 Url of AZD0530 0.234 >8 4.597949505 +AZD6244 Name of AZD6244 Desc of AZD6244 Url of AZD6244 >8 >8 >8 +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 >8 >8 +Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan NA 0.083 NA +L-685458 Name of L-685458 Desc of L-685458 Url of L-685458 >8 >8 3.267752409 +#Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 NA >8 >8 +Nilotinib Name of Nilotinib Desc of Nilotinib Url of Nilotinib >8 >8 NA diff --git a/tests/test_data/study_es_0_inc/meta_cna_discrete.txt b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt new file mode 100644 index 00000000..f6ea8bea --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_discrete.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
+profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace diff --git a/tests/test_data/study_es_0_inc/meta_cna_log2.txt b/tests/test_data/study_es_0_inc/meta_cna_log2.txt new file mode 100644 index 00000000..74a07b8e --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_cna_log2.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: LOG2-VALUE +stable_id: log2CNA +show_profile_in_analysis_tab: false +profile_description: Log2 copy-number values for each gene (from Affymetrix SNP6). +profile_name: Log2 copy-number values +data_filename: data_cna_log2.txt diff --git a/tests/test_data/study_es_0_inc/meta_expression_median.txt b/tests/test_data/study_es_0_inc/meta_expression_median.txt new file mode 100644 index 00000000..1e2fc6a7 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_expression_median.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: MRNA_EXPRESSION +datatype: CONTINUOUS +stable_id: mrna +profile_description: Expression levels (Agilent microarray). +show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_median.txt diff --git a/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt new file mode 100644 index 00000000..582b12e9 --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_methylation_hm27.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: METHYLATION +datatype: CONTINUOUS +stable_id: methylation_hm27 +profile_description: Methylation beta-values (HM27 platform). For genes with multiple methylation probes, the probe least correlated with expression is selected. 
+show_profile_in_analysis_tab: false +profile_name: Methylation (HM27) +data_filename: data_methylation_hm27.txt diff --git a/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt new file mode 100644 index 00000000..0d3281cd --- /dev/null +++ b/tests/test_data/study_es_0_inc/meta_treatment_ic50.txt @@ -0,0 +1,12 @@ +cancer_study_identifier: study_es_0 +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. +data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL \ No newline at end of file From 2ed0bd85cccd081c5eb7708b6929e894d6eb614a Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Wed, 15 May 2024 11:42:49 +0200 Subject: [PATCH 21/27] Remove obsolete TODO comment --- .../java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java index fcc2380e..0358b132 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoGeneticAlteration.java @@ -95,7 +95,6 @@ public int addGeneticAlterations(int geneticProfileId, long entrezGeneId, String return addGeneticAlterationsForGeneticEntity(geneticProfileId, DaoGeneOptimized.getGeneticEntityId(entrezGeneId), values); } - // TODO inc: update instead public int addGeneticAlterationsForGeneticEntity(int geneticProfileId, int geneticEntityId, String[] values) throws DaoException { From 
76b52a9ba3e5d1915b95b478158149f4f93b1109 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Thu, 16 May 2024 22:54:02 +0200 Subject: [PATCH 22/27] Reuse genetic_profile record if it exists in db already Do it for all data types, not only MAF --- .../portal/util/GeneticProfileReader.java | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java index af686a72..ab862756 100644 --- a/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java +++ b/src/main/java/org/mskcc/cbio/portal/util/GeneticProfileReader.java @@ -76,22 +76,25 @@ public static GeneticProfile loadGeneticProfile(File file) throws IOException, D GeneticProfile geneticProfile = loadGeneticProfileFromMeta(file); GeneticProfile existingGeneticProfile = DaoGeneticProfile.getGeneticProfileByStableId(geneticProfile.getStableId()); if (existingGeneticProfile != null) { - if (!existingGeneticProfile.getDatatype().equals("MAF")) { - // the dbms already contains a GeneticProfile with the file's stable_id. This scenario is not supported - // anymore, so throw error telling user to remove existing profile first: - throw new RuntimeException("Error: genetic_profile record found with same Stable ID as the one used in your data: " - + existingGeneticProfile.getStableId() + ". Remove the existing genetic_profile record first."); - } else { - // For mutation data only we can have multiple files with the same genetic_profile. 
- // There is a constraint in the mutation database table to prevent duplicated data - // If this constraint is hit (mistakenly importing the same maf twice) MySqlBulkLoader will throw an exception - // - // make an object combining the pre-existing profile with the file-specific properties of the current file - GeneticProfile gp = new GeneticProfile(existingGeneticProfile); - gp.setTargetLine(gp.getTargetLine()); - gp.setOtherMetadataFields(gp.getAllOtherMetadataFields()); - return gp; + ProgressMonitor.setCurrentMessage("genetic_profile record found with same Stable ID (" + geneticProfile.getStableId() + + "). Using it instead."); + if (geneticProfile.getGeneticAlterationType() != existingGeneticProfile.getGeneticAlterationType()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different genetic alteration type: " + + existingGeneticProfile.getGeneticAlterationType()); } + if (!existingGeneticProfile.getDatatype().equals(geneticProfile.getDatatype())) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different data type: " + + existingGeneticProfile.getDatatype()); + } + if (geneticProfile.getCancerStudyId() != existingGeneticProfile.getCancerStudyId()) { + throw new IllegalStateException("genetic_profile record found with same Stable ID (" + + existingGeneticProfile.getStableId() + ") but different cancer study (id=" + + existingGeneticProfile.getCancerStudyId() + ")"); + } + existingGeneticProfile.setOtherMetadataFields(geneticProfile.getAllOtherMetadataFields()); + return existingGeneticProfile; } // For GSVA profiles, we want to create a geneticProfileLink from source_stable_id for: From fa160767b96376271a1cddc4a9cfb56c0c10159e Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 00:16:31 +0200 Subject: [PATCH 23/27] Test incremental upload of tab delim.
data types from umbrella script - Split big tab. delim test to multiple tests based on data type. - Use ImportProfileData instead of ImportTabDelimData for testing. - We cover more logic with such tests. - This is more stable interface. ImportTabDelimData can be refactored. --- .../scripts/ImportGenericAssayEntity.java | 8 +- .../GeneticAlterationsTestHelper.java | 55 +++ ...IncrementalCopyNumberAlterationImport.java | 177 +++++++ .../TestIncrementalGenericAssayImporter.java | 136 ++++++ .../TestIncrementalGsvaImporter.java | 81 ++++ .../TestIncrementalMrnaExpressionImport.java | 119 +++++ .../TestIncrementalProteinLevelImport.java | 122 +++++ .../TestIncrementalTabDelimData.java | 441 ------------------ .../data_cna_discrete.txt | 0 .../data_cna_pd_annotations.txt | 0 .../meta_cna_discrete.txt | 10 + .../data_treatment_ic50.txt | 2 +- .../generic_assay/meta_treatment_ic50.txt | 12 + .../data_expression_Zscores.txt | 0 .../meta_expression_Zscores.txt | 8 + .../data_rppa.txt | 0 .../incremental/protein_level/meta_rppa.txt | 7 + src/test/resources/seed_mini.sql | 2 +- 18 files changed, 736 insertions(+), 444 deletions(-) create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java create mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java delete mode 100644 src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java rename 
src/test/resources/incremental/{tab_delim_data => copy_number_alteration}/data_cna_discrete.txt (100%) rename src/test/resources/incremental/{tab_delim_data => copy_number_alteration}/data_cna_pd_annotations.txt (100%) create mode 100644 src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt rename src/test/resources/incremental/{tab_delim_data => generic_assay}/data_treatment_ic50.txt (83%) create mode 100644 src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt rename src/test/resources/incremental/{tab_delim_data => mrna_expression}/data_expression_Zscores.txt (100%) create mode 100644 src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt rename src/test/resources/incremental/{tab_delim_data => protein_level}/data_rppa.txt (100%) create mode 100644 src/test/resources/incremental/protein_level/meta_rppa.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java index 2da0ebd2..7da2e983 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayEntity.java @@ -40,9 +40,11 @@ import java.io.File; import java.io.FileReader; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; +import java.util.stream.Collectors; import org.cbioportal.model.EntityType; import org.cbioportal.model.GenericEntityProperty; @@ -50,6 +52,7 @@ import org.mskcc.cbio.portal.dao.DaoGenericAssay; import org.mskcc.cbio.portal.dao.DaoGeneticEntity; import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import joptsimple.OptionParser; @@ -160,7 +163,6 @@ public static void startImport(OptionSet options, OptionSpec data, Optio * @throws Exception */ public static void 
importData(File dataFile, GeneticAlterationType geneticAlterationType, String additionalProperties, boolean updateInfo) throws Exception { - ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile.getCanonicalPath()); // read generic assay data file @@ -186,6 +188,10 @@ public static void importData(File dataFile, GeneticAlterationType geneticAltera currentLine = buf.readLine(); while (currentLine != null) { + if (!FileUtil.isInfoLine(currentLine)) { + currentLine = buf.readLine(); + continue; + } String[] parts = currentLine.split("\t"); diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java new file mode 100644 index 00000000..fdf36995 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/GeneticAlterationsTestHelper.java @@ -0,0 +1,55 @@ +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.jetbrains.annotations.NotNull; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; + +import java.util.HashMap; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class GeneticAlterationsTestHelper { + @NotNull + public static Set geneStableIdsToEntityIds(Set beforeStableIds) { + return beforeStableIds.stream().map(stableId -> { + try { + return geneStableIdToEntityId(stableId); + } catch (DaoException e) { + throw new RuntimeException(e); + } + }).collect(Collectors.toSet()); + } + + public static int geneStableIdToEntityId(String stableId) throws DaoException { + return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); + } + + public static void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { + assertEquals(expectedEntityIds, beforeResult.keySet()); 
+ beforeResult.forEach((entityId, sampleIdToValue) -> { + assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", + expectedSampleIds, beforeResult.get(entityId).keySet()); + }); + } + + public static void assertNoChange(HashMap> beforeResult, + HashMap> afterResult, + Set entityIds, + Set sampleIds) { + entityIds.forEach(entityId -> { + assertTrue("After result is expected to contain entityId=" + entityId, + afterResult.containsKey(entityId)); + sampleIds.forEach(sampleId -> { + assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, + afterResult.get(entityId).containsKey(sampleId)); + assertEquals("The values for sample_id=" + sampleId + + " and entityId=" + entityId + " before and after upload have to match.", + beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); + }); + }); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java new file mode 100644 index 00000000..ad3ebd55 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalCopyNumberAlterationImport.java @@ -0,0 +1,177 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.cbioportal.model.CNA; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCnaEvent; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.CnaEvent; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of COPY_NUMBER_ALTERATION Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalCopyNumberAlterationImport { + + /** + * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) + */ + @Test + public void testDiscreteCNA() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ATM + final long absentGeneEntrezId = 472l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-XX-0800 + final int newSampleId = 15; + // stable_id: TCGA-A1-A0SO + final int updateSampleId = 12; + final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + final Set afterSampleIds = new HashSet<>(beforeSampleIds); + afterSampleIds.add(newSampleId); + + GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); + assertNotNull(discreteCNAProfile); + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); + Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); + List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + null, + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> beforeSampleIdToSampleCnaEvents = 
beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); + + File dataFolder = new File("src/test/resources/incremental/copy_number_alteration/"); + File metaFile = new File(dataFolder, "meta_cna_discrete.txt"); + File dataFile = new File(dataFolder, "data_cna_discrete.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + + List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), + afterResult.keySet(), + discreteCNAProfile.getGeneticProfileId(), + allCnaLevels); + Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); + assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); + beforeCnaEventsSampleIds.forEach(sampleId -> { + if (sampleId == updateSampleId) { + return; + } + Set 
beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); + assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); + }); + Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 208l, CNA.HOMDEL, + 3265l, CNA.AMP, + 4893l, CNA.HOMDEL, + 672l, CNA.AMP, + 673l, CNA.AMP, + 675l, CNA.HOMDEL, + newGeneEntrezId, CNA.HOMDEL + ), + newSampleEntrezGeneIdToCnaAlteration); + Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() + .map(CnaEvent::getEvent) + .collect(Collectors.toMap( + event -> event.getGene().getEntrezGeneId(), + CnaEvent.Event::getAlteration)); + assertEquals(Map.of( + 10000l, CNA.HOMDEL, + 207l, CNA.AMP, + 3845l, CNA.AMP, + 673l, CNA.HOMDEL, + newGeneEntrezId, CNA.AMP + ), + updatedSampleEntrezGeneIdToCnaAlteration); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java new file mode 100644 index 00000000..3162f6a3 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -0,0 +1,136 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import of Generic Assay data + * + * @author Ruslan Forostianov + * @author 
Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGenericAssayImporter { + + /** + * Test incremental upload of GENERIC_ASSAY + */ + @Test + public void testGenericAssay() throws DaoException, IOException { + /** + * Prior checks + */ + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + beforeStableIds.add(absentStableId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + 
assertEquals("After result should have +1 amount of entries", beforeResult.size() + 1, afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), Set.of(noChangeSampleId)); + int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); + assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); + assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); + int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); + assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); + assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); + int absentEntityId = geneStableIdToEntityId(absentStableId); + assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); + assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); + int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); + assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); + assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); + int lbw242EntityId = geneStableIdToEntityId("LBW242"); + assertEquals("0.1", afterResult.get(lbw242EntityId).get(newSampleId)); + assertEquals(">~8", afterResult.get(lbw242EntityId).get(updateSampleId)); + assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java new file mode 100644 index 00000000..c629ecb4 --- /dev/null +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGsvaImporter.java @@ -0,0 +1,81 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . +*/ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneOptimized; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticEntity; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticAlterationType; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportTabDelimData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; +import static 
org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdsToEntityIds; + +/** + * Tests Incremental Import is not supported for GSVA data type + * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) +@Rollback +@Transactional +public class TestIncrementalGsvaImporter { + @Test + public void testGsvaIsNotSupported() throws DaoException, IOException { + GeneticProfile gsvaProfile = new GeneticProfile(); + gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); + gsvaProfile.setStableId("gsva_scores"); + gsvaProfile.setDatatype("GENESET_SCORE"); + gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); + gsvaProfile.setProfileName("gsva test platform"); + DaoGeneticProfile.addGeneticProfile(gsvaProfile); + + assertThrows(UnsupportedOperationException.class, () -> + new ImportTabDelimData(File.createTempFile("gsva", "test"), + DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), + null, + true, + DaoGeneticAlteration.getInstance(), + DaoGeneOptimized.getInstance())); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java new file mode 100644 index 00000000..d44ccee5 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalMrnaExpressionImport.java @@ -0,0 
+1,119 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . + */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.io.IOException; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of MRNA_EXPRESSION Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalMrnaExpressionImport { + + /** + * Test incremental upload of MRNA_EXPRESSION + */ + @Test + public void testMrnaExpression() throws DaoException, IOException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); + assertNotNull(mrnaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/mrna_expression/"); + File metaFile = new File(dataFolder, "meta_expression_Zscores.txt"); + File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample", beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + HashMap newGeneRow = afterResult.get(newGeneEntrezId); + assertEquals("-0.1735", newGeneRow.get(newSampleId)); + assertEquals("-0.6412", newGeneRow.get(updateSampleId)); + HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); + assertEquals("", absentGeneRow.get(newSampleId)); + assertEquals("", absentGeneRow.get(updateSampleId)); + } +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java new file mode 100644 index 00000000..f3933b27 --- /dev/null +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalProteinLevelImport.java @@ -0,0 +1,122 @@ +/* + * This file is part of cBioPortal. + * + * cBioPortal is free software: you can redistribute it and/or modify + * it under the terms of the GNU Affero General Public License as + * published by the Free Software Foundation, either version 3 of the + * License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Affero General Public License for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see . 
+ */ + +package org.mskcc.cbio.portal.integrationTest.incremental; + +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.mskcc.cbio.portal.dao.DaoCancerStudy; +import org.mskcc.cbio.portal.dao.DaoException; +import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; +import org.mskcc.cbio.portal.dao.DaoGeneticProfile; +import org.mskcc.cbio.portal.model.GeneticProfile; +import org.mskcc.cbio.portal.scripts.ImportProfileData; +import org.springframework.test.annotation.Rollback; +import org.springframework.test.context.ContextConfiguration; +import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; +import org.springframework.transaction.annotation.Transactional; + +import java.io.File; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Set; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; +import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; + +/** + * Tests Incremental Import of PROTEIN_LEVEL Data. 
+ * + * @author Ruslan Forostianov + * @author Pieter Lukasse + */ +@RunWith(SpringJUnit4ClassRunner.class) +@ContextConfiguration(locations = {"classpath:/applicationContext-dao.xml"}) +@Rollback +@Transactional +public class TestIncrementalProteinLevelImport { + + /** + * Test incremental upload of PROTEIN_LEVEL + */ + @Test + public void testRppa() throws DaoException { + /** + * Prior checks + */ + // Hugo_Symbol: CDK1 + final long newGeneEntrezId = 983l; + // Gene that is part of the platform, but absent during the incremental upload + // Hugo_Symbol: ARAF + final long absentGeneEntrezId = 369l; + final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); + final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); + beforeEntrezIds.add(absentGeneEntrezId); + + // stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); + final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); + beforeSampleIds.add(updateSampleId); + + GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); + assertNotNull(rppaProfile); + + HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); + + File dataFolder = new File("src/test/resources/incremental/protein_level/"); + File metaFile = new File(dataFolder, "meta_rppa.txt"); + File dataFile = new File(dataFolder, "data_rppa.txt"); + + /** + * Test + */ + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + + /** + * After test assertions + */ + HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); + assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, + afterResult.size()); + afterResult.values() + .forEach(sampleToValue -> + assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); + assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); + assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); + assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); + assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); + } + + @Before + public void setUp() throws DaoException { + DaoCancerStudy.reCacheAll(); + } + +} diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java deleted file mode 100644 index 4f4b2aef..00000000 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalTabDelimData.java +++ /dev/null @@ -1,441 +0,0 @@ -/* - * This file is part of cBioPortal. - * - * cBioPortal is free software: you can redistribute it and/or modify - * it under the terms of the GNU Affero General Public License as - * published by the Free Software Foundation, either version 3 of the - * License. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU Affero General Public License for more details. - * - * You should have received a copy of the GNU Affero General Public License - * along with this program. If not, see . 
-*/ - -package org.mskcc.cbio.portal.integrationTest.incremental; - -import org.cbioportal.model.CNA; -import org.jetbrains.annotations.NotNull; -import org.junit.Before; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.mskcc.cbio.portal.dao.DaoCancerStudy; -import org.mskcc.cbio.portal.dao.DaoCnaEvent; -import org.mskcc.cbio.portal.dao.DaoException; -import org.mskcc.cbio.portal.dao.DaoGeneOptimized; -import org.mskcc.cbio.portal.dao.DaoGeneticAlteration; -import org.mskcc.cbio.portal.dao.DaoGeneticEntity; -import org.mskcc.cbio.portal.dao.DaoGeneticProfile; -import org.mskcc.cbio.portal.model.CnaEvent; -import org.mskcc.cbio.portal.model.GeneticAlterationType; -import org.mskcc.cbio.portal.model.GeneticProfile; -import org.mskcc.cbio.portal.scripts.ImportTabDelimData; -import org.springframework.test.annotation.Rollback; -import org.springframework.test.context.ContextConfiguration; -import org.springframework.test.context.junit4.SpringJUnit4ClassRunner; -import org.springframework.transaction.annotation.Transactional; - -import java.io.File; -import java.io.IOException; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.stream.Collectors; - -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; -import static org.junit.Assert.assertNull; -import static org.junit.Assert.assertThrows; -import static org.junit.Assert.assertTrue; - -/** - * Tests Incremental Import of Tab Delimited Data. 
- * - * @author Ruslan Forostianov - * @author Pieter Lukasse - */ -@RunWith(SpringJUnit4ClassRunner.class) -@ContextConfiguration(locations = { "classpath:/applicationContext-dao.xml" }) -@Rollback -@Transactional -public class TestIncrementalTabDelimData { - - @Before - public void setUp() throws DaoException { - DaoCancerStudy.reCacheAll(); - } - - /** - * Test incremental upload of MRNA_EXPRESSION - */ - @Test - public void testMrnaExpression() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ARAF - final long absentGeneEntrezId = 369l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - GeneticProfile mrnaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_mrna"); - assertNotNull(mrnaProfile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_expression_Zscores.txt"); - - /** - * Test - */ - new ImportTabDelimData(dataFile, - mrnaProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMap(mrnaProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - HashMap newGeneRow = afterResult.get(newGeneEntrezId); - assertEquals("-0.1735", newGeneRow.get(newSampleId)); - assertEquals("-0.6412", newGeneRow.get(updateSampleId)); - HashMap absentGeneRow = afterResult.get(absentGeneEntrezId); - assertEquals("", absentGeneRow.get(newSampleId)); - assertEquals("", absentGeneRow.get(updateSampleId)); - } - - /** - * Test incremental upload of PROTEIN_LEVEL - */ - @Test - public void testRppa() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ARAF - final long absentGeneEntrezId = 369l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 472l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - final Set noChangeSampleIds = Set.of(3, 6, 8, 9, 10, 12, 13); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - GeneticProfile rppaProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_rppa"); - assertNotNull(rppaProfile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, 
beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_rppa.txt"); - - /** - * Test - */ - new ImportTabDelimData(dataFile, - rppaProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(rppaProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - assertEquals("-0.141047088398489", afterResult.get(newGeneEntrezId).get(newSampleId)); - assertEquals("1.61253243564957", afterResult.get(newGeneEntrezId).get(updateSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); - } - - /** - * Test incremental upload of COPY_NUMBER_ALTERATION DISCRETE (gistic) - */ - @Test - public void testDiscreteCNA() throws DaoException, IOException { - /** - * Prior checks - */ - // Hugo_Symbol: CDK1 - final long newGeneEntrezId = 983l; - // Gene that is part of the platform, but absent during the incremental upload - // Hugo_Symbol: ATM - final long absentGeneEntrezId = 472l; - final Set noChangeEntrezIds = Set.of(10000l, 207l, 208l, 3265l, 3845l, 4893l, 672l, 673l, 675l); - final Set beforeEntrezIds = new HashSet<>(noChangeEntrezIds); - beforeEntrezIds.add(absentGeneEntrezId); - - // stable_id: TCGA-XX-0800 - final int newSampleId = 15; - // stable_id: TCGA-A1-A0SO - final int updateSampleId = 12; - final Set noChangeSampleIds = Set.of(1, 2, 3, 4, 5, 6, 7, 8, 9, 
10, 11, 13, 14); - final Set beforeSampleIds = new HashSet<>(noChangeSampleIds); - beforeSampleIds.add(updateSampleId); - - final Set afterSampleIds = new HashSet<>(beforeSampleIds); - afterSampleIds.add(newSampleId); - - GeneticProfile discreteCNAProfile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_gistic"); - assertNotNull(discreteCNAProfile); - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); - assertPriorDataState(beforeResult, beforeEntrezIds, beforeSampleIds); - - List allCnaLevels = Arrays.stream(CNA.values()).map(CNA::getCode).toList(); - Set beforeCnaEventsSampleIds = Set.of(4, 13, 14, updateSampleId); - List beforeSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), - null, - discreteCNAProfile.getGeneticProfileId(), - allCnaLevels); - Map> beforeSampleIdToSampleCnaEvents = beforeSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); - assertEquals(beforeCnaEventsSampleIds, beforeSampleIdToSampleCnaEvents.keySet()); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_cna_discrete.txt"); - File pdAnnotations = new File(dataFolder, "data_cna_pd_annotations.txt"); - - /** - * Test - */ - ImportTabDelimData importer = new ImportTabDelimData(dataFile, - discreteCNAProfile.getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()); - importer.setPdAnnotationsFile(pdAnnotations); - importer.importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMap(discreteCNAProfile.getGeneticProfileId(), null); - assertEquals("After result should get exactly one new gene", beforeEntrezIds.size() + 1, - afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra 
sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, noChangeEntrezIds, noChangeSampleIds); - assertEquals("-2", afterResult.get(newGeneEntrezId).get(newSampleId)); - assertEquals("2", afterResult.get(newGeneEntrezId).get(updateSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(newSampleId)); - assertEquals("", afterResult.get(absentGeneEntrezId).get(updateSampleId)); - - List afterSampleCnaEvents = DaoCnaEvent.getCnaEvents(afterSampleIds.stream().toList(), - afterResult.keySet(), - discreteCNAProfile.getGeneticProfileId(), - allCnaLevels); - Map> afterSampleIdToSampleCnaEvents = afterSampleCnaEvents.stream().collect(Collectors.groupingBy(CnaEvent::getSampleId)); - assertEquals("There is only one new sample that has to gain cna events", beforeCnaEventsSampleIds.size() + 1, afterSampleIdToSampleCnaEvents.size()); - beforeCnaEventsSampleIds.forEach(sampleId -> { - if (sampleId == updateSampleId) { - return; - } - Set beforeCnaEvents = beforeSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); - Set afterCnaEvents = afterSampleIdToSampleCnaEvents.get(sampleId).stream().map(CnaEvent::getEvent).collect(Collectors.toSet()); - assertEquals("CNA events for sample_id=" + sampleId + " must not change.", beforeCnaEvents, afterCnaEvents); - }); - Map newSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(newSampleId).stream() - .map(CnaEvent::getEvent) - .collect(Collectors.toMap( - event -> event.getGene().getEntrezGeneId(), - CnaEvent.Event::getAlteration)); - assertEquals(Map.of( - 208l, CNA.HOMDEL, - 3265l, CNA.AMP, - 4893l, CNA.HOMDEL, - 672l, CNA.AMP, - 673l, CNA.AMP, - 675l, CNA.HOMDEL, - newGeneEntrezId, CNA.HOMDEL - ), - newSampleEntrezGeneIdToCnaAlteration); - Map updatedSampleEntrezGeneIdToCnaAlteration = afterSampleIdToSampleCnaEvents.get(updateSampleId).stream() - .map(CnaEvent::getEvent) - .collect(Collectors.toMap( - event -> 
event.getGene().getEntrezGeneId(), - CnaEvent.Event::getAlteration)); - assertEquals(Map.of( - 10000l, CNA.HOMDEL, - 207l, CNA.AMP, - 3845l, CNA.AMP, - 673l, CNA.HOMDEL, - newGeneEntrezId, CNA.AMP - ), - updatedSampleEntrezGeneIdToCnaAlteration); - } - - @Test - public void testGsvaIsNotSupported() throws DaoException, IOException { - GeneticProfile gsvaProfile = new GeneticProfile(); - gsvaProfile.setCancerStudyId(DaoCancerStudy.getCancerStudyByStableId("study_tcga_pub").getInternalId()); - gsvaProfile.setStableId("gsva_scores"); - gsvaProfile.setDatatype("GENESET_SCORE"); - gsvaProfile.setGeneticAlterationType(GeneticAlterationType.GENESET_SCORE); - gsvaProfile.setProfileName("gsva test platform"); - DaoGeneticProfile.addGeneticProfile(gsvaProfile); - - assertThrows(UnsupportedOperationException.class, () -> - new ImportTabDelimData(File.createTempFile("gsva", "test"), - DaoGeneticProfile.getGeneticProfileByStableId("gsva_scores").getGeneticProfileId(), - null, - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance())); - } - - /** - * Test incremental upload of GENERIC_ASSAY - */ - @Test - public void testGenericAssay() throws DaoException, IOException { - /** - * Prior checks - */ - // Stable id that is part of the platform, but absent during the incremental upload - final String absentStableId = "L-685458"; - final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); - final Set beforeStableIds = new HashSet<>(noChangeStableIds); - beforeStableIds.add(absentStableId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - // stable_id: TCGA-A1-A0SE-01 - final int noChangeSampleId = 3; - final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); - - GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); - assertNotNull(ic50Profile); - - HashMap> beforeResult = 
DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); - assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); - - File dataFolder = new File("src/test/resources/incremental/tab_delim_data/"); - File dataFile = new File(dataFolder, "data_treatment_ic50.txt"); - - /** - * Test - */ - new ImportTabDelimData( - dataFile, - null, - ic50Profile.getGeneticProfileId(), - null, - "NAME,DESCRIPTION,URL", - true, - DaoGeneticAlteration.getInstance(), - DaoGeneOptimized.getInstance()).importData(); - - /** - * After test assertions - */ - HashMap> afterResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - assertEquals("After result should have the same amount of entries", beforeResult.size(), afterResult.size()); - afterResult.values() - .forEach(sampleToValue -> - assertEquals("Each gene row has to get one extra sample",beforeSampleIds.size() + 1, sampleToValue.size())); - assertNoChange(beforeResult, afterResult, geneStableIdsToEntityIds(noChangeStableIds), Set.of(noChangeSampleId)); - int erlotinibEntityId = geneStableIdToEntityId("Erlotinib"); - assertEquals(">8", afterResult.get(erlotinibEntityId).get(newSampleId)); - assertEquals("7.5", afterResult.get(erlotinibEntityId).get(updateSampleId)); - int irinotecanEntityId = geneStableIdToEntityId("Irinotecan"); - assertEquals("", afterResult.get(irinotecanEntityId).get(newSampleId)); - assertEquals("0.081", afterResult.get(irinotecanEntityId).get(updateSampleId)); - int absentEntityId = geneStableIdToEntityId(absentStableId); - assertEquals("", afterResult.get(absentEntityId).get(newSampleId)); - assertEquals("", afterResult.get(absentEntityId).get(updateSampleId)); - int lapatinibEntityId = geneStableIdToEntityId("Lapatinib"); - assertEquals("6.2", afterResult.get(lapatinibEntityId).get(newSampleId)); - 
assertEquals("7.848", afterResult.get(lapatinibEntityId).get(updateSampleId)); - assertNull("No new generic entity has been added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); - } - - @NotNull - private Set geneStableIdsToEntityIds(Set beforeStableIds) { - return beforeStableIds.stream().map(stableId -> { - try { - return geneStableIdToEntityId(stableId); - } catch (DaoException e) { - throw new RuntimeException(e); - } - }).collect(Collectors.toSet()); - } - - private int geneStableIdToEntityId(String stableId) throws DaoException { - return DaoGeneticEntity.getGeneticEntityByStableId(stableId).getId(); - } - - private void assertPriorDataState(HashMap> beforeResult, Set expectedEntityIds, Set expectedSampleIds) { - assertEquals(expectedEntityIds, beforeResult.keySet()); - beforeResult.forEach((entityId, sampleIdToValue) -> { - assertEquals("Samples for gene with entityId = " + entityId + " have to match expected ones", - expectedSampleIds, beforeResult.get(entityId).keySet()); - }); - } - - private void assertNoChange(HashMap> beforeResult, - HashMap> afterResult, - Set entityIds, - Set sampleIds) { - entityIds.forEach(entityId -> { - assertTrue("After result is expected to contain entityId=" + entityId, - afterResult.containsKey(entityId)); - sampleIds.forEach(sampleId -> { - assertTrue("Sample_id=" + sampleId + " expected to be found for gene with entityId=" + entityId, - afterResult.get(entityId).containsKey(sampleId)); - assertEquals("The values for sample_id=" + sampleId + - " and entityId=" + entityId + " before and after upload have to match.", - beforeResult.get(entityId).get(sampleId), afterResult.get(entityId).get(sampleId)); - }); - }); - } - -} diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_cna_discrete.txt rename to 
src/test/resources/incremental/copy_number_alteration/data_cna_discrete.txt diff --git a/src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt b/src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_cna_pd_annotations.txt rename to src/test/resources/incremental/copy_number_alteration/data_cna_pd_annotations.txt diff --git a/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt new file mode 100644 index 00000000..827c31dd --- /dev/null +++ b/src/test/resources/incremental/copy_number_alteration/meta_cna_discrete.txt @@ -0,0 +1,10 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: COPY_NUMBER_ALTERATION +datatype: DISCRETE +stable_id: gistic +show_profile_in_analysis_tab: true +profile_description: Putative copy-number from GISTIC 2.0. Values: -2 = homozygous deletion; -1 = hemizygous deletion; 0 = neutral / no change; 1 = gain; 2 = high level amplification. 
+profile_name: Putative copy-number alterations from GISTIC +data_filename: data_cna_discrete.txt +pd_annotations_filename: data_cna_pd_annotations.txt +namespaces: CustomNamespace diff --git a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt similarity index 83% rename from src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt rename to src/test/resources/incremental/generic_assay/data_treatment_ic50.txt index 5edb7cfa..79606fbf 100644 --- a/src/test/resources/incremental/tab_delim_data/data_treatment_ic50.txt +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50.txt @@ -4,5 +4,5 @@ Irinotecan Name of Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 # The database has this entity, but not the file #L-685458 Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 -#The line will be skipped as the entity stable id is not in the database already +#The entity will be added LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt new file mode 100644 index 00000000..6ec6cdc5 --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50.txt @@ -0,0 +1,12 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. 
+data_filename: data_treatment_ic50.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL diff --git a/src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_expression_Zscores.txt rename to src/test/resources/incremental/mrna_expression/data_expression_Zscores.txt diff --git a/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt new file mode 100644 index 00000000..e761fed3 --- /dev/null +++ b/src/test/resources/incremental/mrna_expression/meta_expression_Zscores.txt @@ -0,0 +1,8 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: MRNA_EXPRESSION +datatype: Z-SCORE +stable_id: mrna +profile_description: Expression levels (Agilent microarray). 
+show_profile_in_analysis_tab: false +profile_name: mRNA expression (microarray) +data_filename: data_expression_Zscores.txt diff --git a/src/test/resources/incremental/tab_delim_data/data_rppa.txt b/src/test/resources/incremental/protein_level/data_rppa.txt similarity index 100% rename from src/test/resources/incremental/tab_delim_data/data_rppa.txt rename to src/test/resources/incremental/protein_level/data_rppa.txt diff --git a/src/test/resources/incremental/protein_level/meta_rppa.txt b/src/test/resources/incremental/protein_level/meta_rppa.txt new file mode 100644 index 00000000..f6481c7d --- /dev/null +++ b/src/test/resources/incremental/protein_level/meta_rppa.txt @@ -0,0 +1,7 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: PROTEIN_LEVEL +datatype: LOG2-VALUE +stable_id: rppa +profile_name: Test RPPA +profile_description: Test protein level data +data_filename: data_rppa.txt diff --git a/src/test/resources/seed_mini.sql b/src/test/resources/seed_mini.sql index 1222ce67..552db83e 100644 --- a/src/test/resources/seed_mini.sql +++ b/src/test/resources/seed_mini.sql @@ -334,7 +334,7 @@ INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (7,'study_tcga_pub_structural_variants',1,'STRUCTURAL_VARIANT','SV','Structural Variants','Structural Variants test data.','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (8,'study_tcga_pub_cna_long',1,'COPY_NUMBER_ALTERATION','DISCRETE_LONG','CNA values','CNA values dummy data','1'); INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES 
(9,'study_tcga_pub_rppa',1,'PROTEIN_LEVEL','LOG2-VALUE','RPPA values','RPPA values dummy data','0'); -INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','TREATMENT_RESPONSE','test treatment values','treatment values dummy data','0'); +INSERT INTO "genetic_profile" ("GENETIC_PROFILE_ID","STABLE_ID","CANCER_STUDY_ID","GENETIC_ALTERATION_TYPE","DATATYPE","NAME","DESCRIPTION","SHOW_PROFILE_IN_ANALYSIS_TAB") VALUES (10,'study_tcga_pub_treatment_ic50',1,'GENERIC_ASSAY','LIMIT-VALUE','test treatment values','treatment values dummy data','0'); -- gene_panel INSERT INTO gene_panel (INTERNAL_ID,STABLE_ID,DESCRIPTION) VALUES (1,'TESTPANEL_CNA_DISCRETE_LONG_FORMAT','Some test panel'); From e5ccc3e90959514a1b3426e4789d2bd34834f2a7 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 00:27:44 +0200 Subject: [PATCH 24/27] Move counting lines of file inside generic assay patient level data uploader --- .../portal/scripts/ImportGenericAssayPatientLevelData.java | 3 ++- .../java/org/mskcc/cbio/portal/scripts/ImportProfileData.java | 3 +-- .../scripts/TestImportGenericAssayPatientLevelData.java | 4 +--- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java index a7dda2ca..dddcc156 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportGenericAssayPatientLevelData.java @@ -79,7 +79,8 @@ public ImportGenericAssayPatientLevelData(File dataFile, String targetLine, int * @throws IOException IO Error. * @throws DaoException Database Error. 
*/ - public void importData(int numLines) throws IOException, DaoException { + public void importData() throws IOException, DaoException { + int numLines = FileUtil.getNumLines(dataFile); geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index 10759baa..d5b6241a 100644 --- a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -83,7 +83,6 @@ public void run() { } // Print profile report - int numLines = FileUtil.getNumLines(dataFile); ProgressMonitor.setCurrentMessage( " --> profile id: " + geneticProfile.getGeneticProfileId() + "\n --> profile name: " + geneticProfile.getProfileName() + @@ -118,7 +117,7 @@ public void run() { String patientLevel = geneticProfile.getOtherMetaDataField("patient_level"); if (patientLevel != null && patientLevel.trim().toLowerCase().equals("true")) { ImportGenericAssayPatientLevelData genericAssayProfileImporter = new ImportGenericAssayPatientLevelData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); - genericAssayProfileImporter.importData(numLines); + genericAssayProfileImporter.importData(); } else { // use ImportTabDelimData importer for non-patient level data ImportTabDelimData genericAssayProfileImporter = new ImportTabDelimData( diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java index 123715f8..480e9a61 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java +++ 
b/src/test/java/org/mskcc/cbio/portal/integrationTest/scripts/TestImportGenericAssayPatientLevelData.java @@ -53,7 +53,6 @@ import org.mskcc.cbio.portal.model.Patient; import org.mskcc.cbio.portal.model.Sample; import org.mskcc.cbio.portal.scripts.ImportGenericAssayPatientLevelData; -import org.mskcc.cbio.portal.util.FileUtil; import org.mskcc.cbio.portal.util.ProgressMonitor; import org.springframework.test.annotation.Rollback; import org.springframework.test.context.ContextConfiguration; @@ -151,8 +150,7 @@ private void runImportGenericAssayPatientLevelData() throws DaoException, IOExce File file = new File("src/test/resources/tabDelimitedData/data_patient_generic_assay.txt"); ImportGenericAssayPatientLevelData parser = new ImportGenericAssayPatientLevelData(file, null, geneticProfileId, null, "name,description"); - int numLines = FileUtil.getNumLines(file); - parser.importData(numLines); + parser.importData(); HashMap> geneticAlterationMap = daoGeneticAlteration.getGeneticAlterationMapForEntityIds(geneticProfileId, Arrays.asList(geneticEntity1.getId(), geneticEntity2.getId())); From 472f47ec88d9056db9d3bf81410f64dcd8c4b303 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 10:13:14 +0200 Subject: [PATCH 25/27] Give error that generic assay patient level data is not supported --- .../portal/scripts/ImportProfileData.java | 3 + .../TestIncrementalGenericAssayImporter.java | 74 +++++++++++++------ .../data_treatment_ic50_patient_level.txt | 8 ++ .../meta_treatment_ic50_patient_level.txt | 13 ++++ 4 files changed, 74 insertions(+), 24 deletions(-) create mode 100644 src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt create mode 100644 src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt diff --git a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java index d5b6241a..0e1ff058 100644 --- 
a/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java +++ b/src/main/java/org/mskcc/cbio/portal/scripts/ImportProfileData.java @@ -116,6 +116,9 @@ public void run() { // use a different importer for patient level data String patientLevel = geneticProfile.getOtherMetaDataField("patient_level"); if (patientLevel != null && patientLevel.trim().toLowerCase().equals("true")) { + if (overwriteExisting) { + throw new UnsupportedOperationException("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead."); + } ImportGenericAssayPatientLevelData genericAssayProfileImporter = new ImportGenericAssayPatientLevelData(dataFile, geneticProfile.getTargetLine(), geneticProfile.getGeneticProfileId(), genePanel, geneticProfile.getOtherMetaDataField("generic_entity_meta_properties")); genericAssayProfileImporter.importData(); } else { diff --git a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java index 3162f6a3..e0ef8cf5 100644 --- a/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java +++ b/src/test/java/org/mskcc/cbio/portal/integrationTest/incremental/TestIncrementalGenericAssayImporter.java @@ -41,6 +41,7 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertThrows; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertNoChange; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.assertPriorDataState; import static org.mskcc.cbio.portal.integrationTest.incremental.GeneticAlterationsTestHelper.geneStableIdToEntityId; @@ -58,34 +59,28 @@ @Transactional public class TestIncrementalGenericAssayImporter { + 
// stable_id: TCGA-A1-A0SB-01 + final int newSampleId = 1; + // stable_id: TCGA-A1-A0SD-01 + final int updateSampleId = 2; + // stable_id: TCGA-A1-A0SE-01 + final int noChangeSampleId = 3; + final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); + + // Stable id that is part of the platform, but absent during the incremental upload + final String absentStableId = "L-685458"; + final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); + final Set beforeStableIds = new HashSet<>(noChangeStableIds); + { beforeStableIds.add(absentStableId); } + + private GeneticProfile ic50Profile; + private HashMap> beforeResult; + /** * Test incremental upload of GENERIC_ASSAY */ @Test - public void testGenericAssay() throws DaoException, IOException { - /** - * Prior checks - */ - // Stable id that is part of the platform, but absent during the incremental upload - final String absentStableId = "L-685458"; - final Set noChangeStableIds = Set.of("Erlotinib", "Irinotecan", "Lapatinib"); - final Set beforeStableIds = new HashSet<>(noChangeStableIds); - beforeStableIds.add(absentStableId); - - // stable_id: TCGA-A1-A0SB-01 - final int newSampleId = 1; - // stable_id: TCGA-A1-A0SD-01 - final int updateSampleId = 2; - // stable_id: TCGA-A1-A0SE-01 - final int noChangeSampleId = 3; - final Set beforeSampleIds = Set.of(updateSampleId, noChangeSampleId); - - GeneticProfile ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); - assertNotNull(ic50Profile); - - HashMap> beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); - Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); - assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); + public void testGenericAssay() throws DaoException { File dataFolder = new File("src/test/resources/incremental/generic_assay/"); File metaFile = new File(dataFolder, 
"meta_treatment_ic50.txt"); @@ -128,9 +123,40 @@ public void testGenericAssay() throws DaoException, IOException { assertNotNull("New generic entity has to be added", DaoGeneticEntity.getGeneticEntityByStableId("LBW242")); } + /** + * Test that incremental upload of GENERIC_ASSAY (patient level) is not supported + */ + @Test + public void testGenericAssayPatientLevel() throws DaoException { + + File dataFolder = new File("src/test/resources/incremental/generic_assay/"); + File metaFile = new File(dataFolder, "meta_treatment_ic50_patient_level.txt"); + File dataFile = new File(dataFolder, "data_treatment_ic50_patient_level.txt"); + + /** + * Test + */ + assertThrows("Incremental upload for generic assay patient_level data is not supported. Please use sample level instead.", + RuntimeException.class, () -> { + new ImportProfileData(new String[] { + "--loadMode", "bulkLoad", + "--meta", metaFile.getAbsolutePath(), + "--data", dataFile.getAbsolutePath(), + "--overwrite-existing", + }).run(); + }); + } + @Before public void setUp() throws DaoException { DaoCancerStudy.reCacheAll(); + + ic50Profile = DaoGeneticProfile.getGeneticProfileByStableId("study_tcga_pub_treatment_ic50"); + assertNotNull(ic50Profile); + + beforeResult = DaoGeneticAlteration.getInstance().getGeneticAlterationMapForEntityIds(ic50Profile.getGeneticProfileId(), null); + Set beforeEntityIds = geneStableIdsToEntityIds(beforeStableIds); + assertPriorDataState(beforeResult, beforeEntityIds, beforeSampleIds); } } diff --git a/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..34753bba --- /dev/null +++ b/src/test/resources/incremental/generic_assay/data_treatment_ic50_patient_level.txt @@ -0,0 +1,8 @@ +ENTITY_STABLE_ID NAME DESCRIPTION URL TCGA-A1-A0SB TCGA-A1-A0SD +Erlotinib Name of Erlotinib Desc of Erlotinib Url of Erlotinib >8 7.5 +Irinotecan Name of 
Irinotecan Desc of Irinotecan Url of Irinotecan 0.081 +# The database has this entity, but not the file +#L-685458 +Lapatinib Name of Lapatinib Desc of Lapatinib Url of Lapatinib 6.2 7.848 +#The entity will be added +LBW242 Name of LBW242 Desc of LBW242 Url of LBW242 0.1 >~8 diff --git a/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt new file mode 100644 index 00000000..181899f5 --- /dev/null +++ b/src/test/resources/incremental/generic_assay/meta_treatment_ic50_patient_level.txt @@ -0,0 +1,13 @@ +cancer_study_identifier: study_tcga_pub +genetic_alteration_type: GENERIC_ASSAY +generic_assay_type: TREATMENT_RESPONSE +datatype: LIMIT-VALUE +stable_id: treatment_ic50 +profile_name: IC50 values of compounds on cellular phenotype readout +profile_description: IC50 (compound concentration resulting in half maximal inhibition) of compounds on cellular phenotype readout of cultured mutant cell lines. 
+data_filename: data_treatment_ic50_patient_level.txt +show_profile_in_analysis_tab: true +pivot_threshold_value: 0.1 +value_sort_order: ASC +generic_entity_meta_properties: NAME,DESCRIPTION,URL +patient_level: true From c54e303ea4793cc384461288391cd318583f22bb Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Fri, 17 May 2024 14:05:24 +0200 Subject: [PATCH 26/27] Clean sample_cna_event regardless of whether it has alteration_driver_annotation rows --- src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java index 0e4ab7e8..f19bf514 100644 --- a/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java +++ b/src/main/java/org/mskcc/cbio/portal/dao/DaoCnaEvent.java @@ -129,7 +129,7 @@ public static void removeSampleCnaEvents(int cnaProfileId, List sampleI pstmt = con.prepareStatement ("DELETE sample_cna_event, alteration_driver_annotation" + " FROM sample_cna_event" + - " JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + + " LEFT JOIN alteration_driver_annotation ON alteration_driver_annotation.`ALTERATION_EVENT_ID` = sample_cna_event.`CNA_EVENT_ID`" + " WHERE sample_cna_event.`GENETIC_PROFILE_ID` = ? 
AND sample_cna_event.`SAMPLE_ID` IN (" + String.join(",", Collections.nCopies(sampleIds.size(), "?")) + ")"); From 18dbdd33dc98390f7285649413eabe751d5b10a6 Mon Sep 17 00:00:00 2001 From: Ruslan Forostianov Date: Tue, 28 May 2024 13:33:11 +0200 Subject: [PATCH 27/27] Fix cbioportalImporter script execution: the args variable was not declared --- scripts/importer/cbioportalImporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/importer/cbioportalImporter.py b/scripts/importer/cbioportalImporter.py index ea9cfa50..c2f65cc0 100755 --- a/scripts/importer/cbioportalImporter.py +++ b/scripts/importer/cbioportalImporter.py @@ -653,5 +653,5 @@ def main(args): # ready to roll if __name__ == '__main__': - parsed_args = interface(args) + parsed_args = interface() main(parsed_args)