From 8d16493c3c4ec0e116c99c90a9b672e032f53984 Mon Sep 17 00:00:00 2001 From: Bob Date: Wed, 12 Feb 2025 22:03:38 +0000 Subject: [PATCH 01/31] WIP --- .../ai/SingleGeneAiExpressionReporter.java | 12 + .../ai/expression/ExperimentProcessor.java | 75 ++++ .../report/ai/expression/ExpressionData.java | 22 ++ .../report/ai/expression/Summarizer.java | 320 ++++++++++++++++++ 4 files changed, 429 insertions(+) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java new file mode 100644 index 000000000..aa811cf39 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -0,0 +1,12 @@ +package org.apidb.apicommon.model.report.ai; + +import org.gusdb.wdk.model.report.AbstractReporter; +import org.apidb.apicommon.model.report.ai.expression.Summarizer; + +public class SingleGeneAiExpressionReporter extends AbstractReporter { + + // configure: is any config needed? + + // write: does the business - see SingleGeneReporter for example +} + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java new file mode 100644 index 000000000..2ba4b6fc6 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java @@ -0,0 +1,75 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.json.JSONArray; +import org.json.JSONObject; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class ExperimentProcessor { + private static final Set KEYS_TO_KEEP = Set.of( + "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", + "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" + ); + + + public static List processExpressionData(ExpressionData expressionData) { + return processExpressionData(expressionData, 0); + } + + // for debugging only + public static List processExpressionData(ExpressionData expressionData, String datasetId) { + List experiments = processExpressionData(expressionData, 0); + return experiments.stream() + .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) + .collect(Collectors.toList()); + } + + // maxExperiments is for dev/debugging only + public static List processExpressionData(ExpressionData expressionData, int maxExperiments) { + List experiments = new ArrayList<>(); + + for (JSONObject expressionGraph : expressionData.getExpressionGraphs()) { + String datasetId = expressionGraph.getString("dataset_id"); + + // Extract only relevant keys from expressionGraph + JSONObject experimentInfo = new JSONObject(); + for (String key : KEYS_TO_KEEP) { + if (expressionGraph.has(key)) { + experimentInfo.put(key, expressionGraph.get(key)); + } + } + + // Filter expressionGraphsDataTable to match dataset_id + List 
filteredData = new ArrayList<>(); + for (JSONObject entry : expressionData.getExpressionGraphsDataTable()) { + if (datasetId.equals(entry.getString("dataset_id"))) { + JSONObject dataEntry = new JSONObject(); + dataEntry.put("sample_name", entry.getString("sample_name")); + dataEntry.put("value", entry.get("value")); + if (entry.has("standard_error")) { + dataEntry.put("standard_error", entry.get("standard_error")); + } + if (entry.has("percentile_channel1")) { + dataEntry.put("percentile_channel1", entry.get("percentile_channel1")); + } + if (entry.has("percentile_channel2")) { + dataEntry.put("percentile_channel2", entry.get("percentile_channel2")); + } + filteredData.add(dataEntry); + } + } + + // Combine and store experiment data + experimentInfo.put("data", filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) { + break; + } + } + + return experiments; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java new file mode 100644 index 000000000..c2e688878 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java @@ -0,0 +1,22 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.json.JSONObject; +import java.util.List; + +public class ExpressionData { + private final List expressionGraphs; + private final List expressionGraphsDataTable; + + public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { + this.expressionGraphs = expressionGraphs; + this.expressionGraphsDataTable = expressionGraphsDataTable; + } + + public List getExpressionGraphs() { + return expressionGraphs; + } + + public List getExpressionGraphsDataTable() { + return expressionGraphsDataTable; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java new file mode 100644 index 000000000..65117fc60 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -0,0 +1,320 @@ +package org.apidb.apicommon.model.report.ai.expression; + +// +// TO DO - add deps to pom.xml +// + +import org.json.JSONObject; +import org.json.JSONArray; +import org.json.JSONException; + +import com.openai.client.OpenAIClientAsync; +import com.openai.client.okhttp.OpenAIOkHttpClientAsync; +import com.openai.models.ChatCompletionCreateParams; +import com.openai.models.ChatModel; +import com.openai.models.ChatCompletion; +import com.openai.models.ResponseFormatJsonSchema; +import com.openai.models.ResponseFormatJsonSchema.JsonSchema; +import com.openai.core.JsonValue; +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +public class Summarizer { + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() + .fromEnv() // Uses OPENAI_API_KEY from env + .maxRetries(32) // Handle 429 errors + .build(); + + // provide exact model number for semi-reproducibility + private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + private static int MAX_RESPONSE_TOKENS = 5000; + + private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; + + // Prepare JSON schemas for structured responses + private static final JsonSchema.Schema experimentResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes") + ) + ) + .build(); + + private static final JsonSchema.Schema finalResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", + Map.of("type", "array", + "minimum", 1, + "items", + Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", + Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", + "items", Map.of("type", "string")) + ) + ) + ) + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ) + ) + ) + .build(); + + public static JSONObject summariseExpression(String geneId, String projectId, String serviceBaseUrl) { + System.out.println("Summarising expression for Gene ID: " + geneId + " with model: " + model.toString()); + + // Placeholder for the actual implementation + System.out.println("Fetching data from: " + serviceBaseUrl); + + try { + // Call the API client to fetch expression data + ExpressionData expressionData = WdkClient.fetchExpressionData(serviceBaseUrl, geneId, projectId); + + // Print retrieved data (debugging) + System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); + System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); + + // Process expression data further into a list of pruned metadata plus data + List experimentsWithData = ExperimentProcessor.processExpressionData(expressionData); + System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); + + // Send AI requests in parallel + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + List> aiRequests = experimentsWithData.stream() + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); + // Wait for all requests to complete + List responses = aiRequests.stream() + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); + + // Debug output + // System.out.println("Individual responses:"); + // responses.forEach(response -> System.out.println(response.toString(2))); + // System.exit(0); + + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; + + } catch (Exception e) { + // Handle errors gracefully + System.err.println("Error fetching expression data: " + e.getMessage()); + e.printStackTrace(); // Print stack 
trace for debugging + } + + return null; + } + + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + + // Possible TO DO: AI EDIT DESCRIPTION + // Before sending the experiment+data to the AI, ask the AI to edit the `description` field + // as follows: (This should be cached by dataset_id only and would be called once per organism + // and would reduce tokens and "cognitive load" a little bit for the next step.) + // + // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + + + + // We don't need to send the dataset_id to the AI but it's useful to have in the response for phase two + JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; + experimentForAI.remove("dataset_id"); + +// specific experimental fixes for "DS_2e639b71f6" +// experimentForAI.put("display_name", "Transcriptional profiling of male head comparing swarming mosquito with control non-swarming mosquito (Anopheles coluzzii, aligned to A. gambiae PEST strain)"); +// experimentForAI.put("y_axis", "Expression Values for 2 channel microarray experiments are log ratios."); +// experimentForAI.put("description", "Anopheles coluzzii mosquitoes were collected in July in Vallée du Kou, Bobo-Dioulasso, Burkina Faso in 2011. Mosquitoes, mostly males, were collected in swarms using sweeping net during dusk. The indoor resting (nonswarming) males with antennal fibrillae becoming erect were collected in inhabited houses using vacuum aspiration just prior to swarming time. The collected mosquitoes were placed in tubes containing RNAlater to prevent RNA degradation. Mosquito species was molecularly identified by SINE-PCR. Total RNA from 50 male mosquito heads was isolated. Both swarm male heads and indoor resting male heads were used as samples. Laboratory reared 2-6-day old virgin An. gambiae s.s. male heads were used as reference samples (control). Three biological replicates were performed for each group.    \nMicroarray analysis: Cy5- and Cy3-labeled cRNA probes were generated from 200 ng of RNA using Agilent Technologies Low Input Quick Amp Labeling Kit according to the manufacturer's instructions. Probe hybridization to the microarray slides was performed with 2 μg cRNA probes.   "); + + String message = """ +The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata: + +```json +%s +``` + +**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. 
Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary. +**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments. +**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields. +""".formatted(experimentForAI.toString()); + +// System.out.println(message); /// DEBUG + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) + // .temperature(1.0) + .build(); + + // add dataset_id back to the response + return openAIClient.chat().completions().create(request) + .thenApply(completion -> { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + return jsonObject; + } catch (JSONException e) { + System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + } + }); + } + + + private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + + String message = """ +Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format: + +```json +%s +``` + +Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. 
Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `<i>` tags and use clear, scientific language accessible to non-native English speakers throughout your response.
+""".formatted(new JSONArray(experiments));
+
+        ChatCompletionCreateParams request = ChatCompletionCreateParams.builder()
+            .model(model)
+            .maxCompletionTokens(MAX_RESPONSE_TOKENS)
+            .responseFormat(ResponseFormatJsonSchema.builder()
+                .jsonSchema(JsonSchema.builder()
+                    .name("expression-summary")
+                    .schema(finalResponseSchema)
+                    .build())
+                .build())
+            .addSystemMessage(systemMessage)
+            .addUserMessage(message)
+            .build();
+
+        // System.out.println(message);
+
+        ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response
+        String jsonString = completion.choices().get(0).message().content().get();
+        JSONObject rawResponseObject = new JSONObject(jsonString);
+
+        // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI
+        JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments);
+
+        return finalResponseObject;
+    }
+
+
+    public static JSONObject consolidateSummary(JSONObject summaryResponse, List<JSONObject> individualResults) {
+        // Gather all dataset IDs from individualResults and map them to summaries
+        Map<String, JSONObject> datasetSummaries = new HashMap<>();
+        for (JSONObject result : individualResults) {
+            datasetSummaries.put(result.getString("dataset_id"), result);
+        }
+
+        Set<String> seenDatasetIds = new HashSet<>();
+        JSONArray deduplicatedSections = new JSONArray();
+        JSONArray sections = summaryResponse.getJSONArray("sections");
+
+        for (int i = 0; i < sections.length(); i++) {
+            JSONObject section = sections.getJSONObject(i);
+            JSONArray datasetIds = section.getJSONArray("dataset_ids");
+            JSONArray summaries = new JSONArray();
+
+            for (int j = 0; j < datasetIds.length(); j++) {
+                String id = datasetIds.getString(j);
+
+                // Warn and skip if the id doesn't exist
+                if (!datasetSummaries.containsKey(id)) {
+                    System.out.println("WARNING: summary section id '" + id + "' does not exist. 
Excluding from final output."); + continue; + } + // Skip if we've seen it + if (seenDatasetIds.contains(id)) continue; + + seenDatasetIds.add(id); + summaries.put(datasetSummaries.get(id)); + } + + // Update section with mapped summaries and remove dataset_ids key + section.put("summaries", summaries); + section.remove("dataset_ids"); + deduplicatedSections.put(section); + } + + // Find missing dataset IDs + Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); + missingDatasetIds.removeAll(seenDatasetIds); + + // If there are missing IDs, add an "Others" section + if (!missingDatasetIds.isEmpty()) { + JSONArray otherSummaries = new JSONArray(); + for (String id : missingDatasetIds) { + otherSummaries.put(datasetSummaries.get(id)); + } + + JSONObject otherSection = new JSONObject(); + otherSection.put("headline", "Other"); + otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); + otherSection.put("summaries", otherSummaries); + deduplicatedSections.put(otherSection); + } + + // Create final deduplicated summary + JSONObject finalSummary = new JSONObject(summaryResponse.toString()); + finalSummary.put("sections", deduplicatedSections); + return finalSummary; + } + +} + From f85f2a03173b9a1997db2148351335d3a48874d1 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 20:47:07 +0000 Subject: [PATCH 02/31] it compiles - at least --- Model/pom.xml | 8 +++ .../ai/SingleGeneAiExpressionReporter.java | 49 +++++++++++++++++-- .../report/ai/expression/Summarizer.java | 45 +++++------------ 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/Model/pom.xml b/Model/pom.xml index c216d7981..0d1bbe5dd 100644 --- a/Model/pom.xml +++ b/Model/pom.xml @@ -134,6 +134,14 @@ io.vulpine.lib Jackfish + + + + com.openai + openai-java + 0.22.0 + + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index aa811cf39..06e18d3d3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -1,12 +1,55 @@ package org.apidb.apicommon.model.report.ai; import org.gusdb.wdk.model.report.AbstractReporter; +import org.gusdb.wdk.model.report.Reporter; +import org.gusdb.wdk.model.report.ReporterConfigException; import org.apidb.apicommon.model.report.ai.expression.Summarizer; +import org.gusdb.wdk.model.WdkModelException; -public class SingleGeneAiExpressionReporter extends AbstractReporter { +import org.json.JSONObject; +import java.io.IOException; +import java.io.OutputStream; - // configure: is any config needed? 
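The rewritten reporter below is driven by a single `cacheMode` option. As a rough caller-side sketch, assuming only the key read in `configure()` (the service wiring that delivers this JSON is not part of this patch):

```java
// Hypothetical caller-side configuration sketch; "cacheMode" is the only key
// configure() below reads, and it accepts "test" (the default) or "populate",
// matched case-insensitively by CacheMode.fromString().
JSONObject reporterConfig = new JSONObject().put("cacheMode", "populate");
// The reporter instance would then be set up via reporter.configure(reporterConfig).
```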
+public class SingleGeneAiExpressionReporter extends AbstractReporter { + + private enum CacheMode { + TEST("test"), + POPULATE("populate"); + private final String mode; + CacheMode(String mode) { + this.mode = mode; + } + public String getMode() { + return mode; + } + public static CacheMode fromString(String mode) throws IllegalArgumentException { + for (CacheMode cm : CacheMode.values()) { + if (cm.mode.equalsIgnoreCase(mode)) { + return cm; + } + } + throw new IllegalArgumentException("Invalid CacheMode: " + mode); + } + } + + private CacheMode _cacheMode = CacheMode.TEST; + + @Override + public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { + try { + _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + } catch (IllegalArgumentException e) { + throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); + } + return this; + } + + @Override + protected void write(OutputStream out) throws IOException, WdkModelException { + + } + - // write: does the business - see SingleGeneReporter for example } + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 65117fc60..d94a03e58 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -103,16 +103,9 @@ public class Summarizer { ) .build(); - public static JSONObject summariseExpression(String geneId, String projectId, String serviceBaseUrl) { - System.out.println("Summarising expression for Gene ID: " + geneId + " with model: " + model.toString()); + public static JSONObject summariseExpression(ExpressionData expressionData) { - // Placeholder for the actual implementation - System.out.println("Fetching data from: " + serviceBaseUrl); - try { - // Call the API client to fetch expression data - ExpressionData expressionData = WdkClient.fetchExpressionData(serviceBaseUrl, geneId, projectId); - // Print retrieved data (debugging) System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); @@ -161,27 +154,17 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e - // We don't need to send the dataset_id to the AI but it's useful to have in the response for phase two + // We don't need to send the dataset_id to the AI but it's useful to have in the + // response for phase two - so we save it for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - -// specific experimental fixes for "DS_2e639b71f6" -// experimentForAI.put("display_name", "Transcriptional profiling of male head comparing swarming mosquito with control non-swarming mosquito (Anopheles coluzzii, aligned to A. gambiae PEST strain)"); -// experimentForAI.put("y_axis", "Expression Values for 2 channel microarray experiments are log ratios."); -// experimentForAI.put("description", "Anopheles coluzzii mosquitoes were collected in July in Vallée du Kou, Bobo-Dioulasso, Burkina Faso in 2011. Mosquitoes, mostly males, were collected in swarms using sweeping net during dusk. 
The indoor resting (nonswarming) males with antennal fibrillae becoming erect were collected in inhabited houses using vacuum aspiration just prior to swarming time. The collected mosquitoes were placed in tubes containing RNAlater to prevent RNA degradation. Mosquito species was molecularly identified by SINE-PCR. Total RNA from 50 male mosquito heads was isolated. Both swarm male heads and indoor resting male heads were used as samples. Laboratory reared 2-6-day old virgin An. gambiae s.s. male heads were used as reference samples (control). Three biological replicates were performed for each group.    \nMicroarray analysis: Cy5- and Cy3-labeled cRNA probes were generated from 200 ng of RNA using Agilent Technologies Low Input Quick Amp Labeling Kit according to the manufacturer's instructions. Probe hybridization to the microarray slides was performed with 2 μg cRNA probes.   "); - String message = """ -The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata: - -```json -%s -``` - -**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary. -**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments. -**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields. 
-""".formatted(experimentForAI.toString()); + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; // System.out.println(message); /// DEBUG @@ -219,15 +202,9 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = """ -Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format: - -```json -%s -``` - -Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response. 
-""".formatted(new JSONArray(experiments)); + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) From 089511c5203031143fc162bbbef28868c110efae Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:11:52 +0000 Subject: [PATCH 03/31] all wired together and compiles --- .../ai/SingleGeneAiExpressionReporter.java | 33 +- .../ai/expression/ExperimentProcessor.java | 75 --- .../ai/expression/GeneRecordProcessor.java | 83 ++++ .../report/ai/expression/Summarizer.java | 450 +++++++++--------- 4 files changed, 337 insertions(+), 304 deletions(-) delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 06e18d3d3..f424f3c66 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -5,14 +5,25 @@ import org.gusdb.wdk.model.report.ReporterConfigException; import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.record.RecordClass; +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.answer.stream.RecordStream; +import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; +import org.apidb.apicommon.model.TranscriptUtil; +import org.gusdb.wdk.model.record.TableField; +import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.WdkUserException; import org.json.JSONObject; import java.io.IOException; import java.io.OutputStream; +import java.util.Map; +import java.util.List; +import java.util.stream.Collectors; public class SingleGeneAiExpressionReporter extends AbstractReporter { - private enum CacheMode { + public enum CacheMode { TEST("test"), POPULATE("populate"); private final String mode; @@ -37,7 +48,9 @@ public static CacheMode fromString(String mode) throws IllegalArgumentException @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { - _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + if (config.has("cacheMode")) { + _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + } } catch (IllegalArgumentException e) { throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); } @@ -46,7 +59,23 @@ public Reporter 
configure(JSONObject config) throws ReporterConfigException, Wdk @Override protected void write(OutputStream out) throws IOException, WdkModelException { + RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); + Map tableFields = geneRecordClass.getTableFieldMap(); + List tables = List.of("ExpressionGraphs", "ExpressionGraphsDataTable").stream() + .map(name -> tableFields.get(name)) + .collect(Collectors.toList()); + + try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { + RecordInstance singleRecord = recordStream.iterator().next(); + JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); + out.write(expressionSummary.toString().getBytes()); + out.flush(); + } + catch (WdkUserException e) { + throw new WdkModelException(e); + } + } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java deleted file mode 100644 index 2ba4b6fc6..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.apidb.apicommon.model.report.ai.expression; - -import org.json.JSONArray; -import org.json.JSONObject; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -public class ExperimentProcessor { - private static final Set KEYS_TO_KEEP = Set.of( - "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", - "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" - ); - - - public static List processExpressionData(ExpressionData expressionData) { - return processExpressionData(expressionData, 0); - } - - // for debugging only - public static List processExpressionData(ExpressionData expressionData, String datasetId) { - List experiments = processExpressionData(expressionData, 0); - return experiments.stream() - .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) - .collect(Collectors.toList()); - } - - // maxExperiments is for dev/debugging only - public static List processExpressionData(ExpressionData expressionData, int maxExperiments) { - List experiments = new ArrayList<>(); - - for (JSONObject expressionGraph : expressionData.getExpressionGraphs()) { - String datasetId = expressionGraph.getString("dataset_id"); - - // Extract only relevant keys from expressionGraph - JSONObject experimentInfo = new JSONObject(); - for (String key : KEYS_TO_KEEP) { - if (expressionGraph.has(key)) { - experimentInfo.put(key, expressionGraph.get(key)); - } - } - - // Filter expressionGraphsDataTable to match dataset_id - List filteredData = new ArrayList<>(); - for (JSONObject entry : expressionData.getExpressionGraphsDataTable()) { - if (datasetId.equals(entry.getString("dataset_id"))) { - JSONObject dataEntry = new JSONObject(); - dataEntry.put("sample_name", entry.getString("sample_name")); - dataEntry.put("value", entry.get("value")); - if (entry.has("standard_error")) { - dataEntry.put("standard_error", entry.get("standard_error")); - } - if (entry.has("percentile_channel1")) { - dataEntry.put("percentile_channel1", entry.get("percentile_channel1")); - } - if (entry.has("percentile_channel2")) { - dataEntry.put("percentile_channel2", entry.get("percentile_channel2")); - } - filteredData.add(dataEntry); - } - } - - // Combine and store 
experiment data - experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); - - if (maxExperiments > 0 && experiments.size() >= maxExperiments) { - break; - } - } - - return experiments; - } -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java new file mode 100644 index 000000000..0bb32c745 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -0,0 +1,83 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.record.TableValue; +import org.gusdb.wdk.model.record.TableValueRow; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.WdkModelException; + +import org.json.JSONArray; +import org.json.JSONObject; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +/** + * expects a geneRecord with two tables: "ExpressionGraphs" and "ExpressionGraphsDataTable" + * + * returns a list of JSON Objects of data ready to feed the AI + */ + +public class GeneRecordProcessor { + private static final Set KEYS_TO_KEEP = + Set.of( + "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", + "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" + ); + + public static List processExpressionData(RecordInstance geneRecord) throws WdkModelException, WdkUserException { + return processExpressionData(geneRecord, 0); + } + + // for debugging only + public static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException, WdkUserException { + List experiments = processExpressionData(geneRecord, 0); + return experiments.stream() + .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) + .collect(Collectors.toList()); + } + + // maxExperiments is for dev/debugging only + public static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException, WdkUserException { + // return value: + List experiments = new ArrayList<>(); + + TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); + TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); + + for (TableValueRow experimentRow : expressionGraphs) { + JSONObject experimentInfo = new JSONObject(); + + // Extract all relevant attributes + for (String key : KEYS_TO_KEEP) { + experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); + } + + List filteredData = new ArrayList<>(); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + List thisExperimentDataRows = new ArrayList<>(); + for (TableValueRow dataRow : expressionGraphsDataTable) { + if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { + JSONObject dataEntry = new JSONObject(); + + // Extract relevant numeric fields + List dataKeys = List.of("value", "standard_error", "percentile_channel1", "percentile_channel2", "sample_name"); + for (String key : dataKeys) { + dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + } + + filteredData.add(dataEntry); + } + } + + experimentInfo.put("data", 
filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; + } + return experiments; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d94a03e58..98961b7f1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,8 +1,8 @@ package org.apidb.apicommon.model.report.ai.expression; -// -// TO DO - add deps to pom.xml -// +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.WdkModelException; import org.json.JSONObject; import org.json.JSONArray; @@ -25,273 +25,269 @@ import java.util.stream.Collectors; public class Summarizer { - private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() - .fromEnv() // Uses OPENAI_API_KEY from env - .maxRetries(32) // Handle 429 errors - .build(); - - // provide exact model number for semi-reproducibility - private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 5000; + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() + .fromEnv() // Uses OPENAI_API_KEY from env + .maxRetries(32) // Handle 429 errors + .build(); + + // provide exact model number for semi-reproducibility + private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + private static int MAX_RESPONSE_TOKENS = 5000; - private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; - - // Prepare JSON schemas for structured responses - private static final JsonSchema.Schema experimentResponseSchema = - JsonSchema.Schema.builder() - .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "one_sentence_summary", Map.of("type", "string"), - "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), - "notes", Map.of("type", "string") - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "one_sentence_summary", - "biological_importance", - "confidence", - "experiment_keywords", - "notes") - ) - ) - .build(); - - private static final JsonSchema.Schema finalResponseSchema = - JsonSchema.Schema.builder() - .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "headline", Map.of("type", "string"), - "one_paragraph_summary", Map.of("type", "string"), - "sections", - Map.of("type", "array", - "minimum", 1, - "items", - Map.of( - "type", "object", - "required", List.of("headline", "one_sentence_summary", "dataset_ids"), - "properties", - Map.of( - "headline", Map.of("type", "string"), - "one_sentence_summary", Map.of("type", "string"), - "dataset_ids", Map.of("type", "array", - "items", Map.of("type", "string")) - ) - ) - ) - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "headline", - "one_paragraph_summary", - "dataset_ids" - ) - ) - ) - .build(); + private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; + + // Prepare JSON schemas for structured responses + // NOTE: this code is horrible to look at/read. It would be better to just define the schemas as JSON strings + // but this is only really nice when we have """ text block """ support, coming soon when we upgrade, perhaps? 
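As a sketch of what the NOTE above has in mind (assuming the Java text-block support it mentions), the first schema could be written as one JSON literal; the property names mirror the builder-based `experimentResponseSchema` below, and converting the string back into the SDK's `JsonValue` map is left out here:

```java
// Sketch only: the same structure as experimentResponseSchema below, expressed as plain JSON.
// Requires text blocks (Java 15+); the conversion back into a JsonValue map is not shown.
private static final String EXPERIMENT_RESPONSE_SCHEMA_JSON = """
    {
      "type": "object",
      "properties": {
        "one_sentence_summary":  { "type": "string" },
        "biological_importance": { "type": "integer", "minimum": 0, "maximum": 5 },
        "confidence":            { "type": "integer", "minimum": 0, "maximum": 5 },
        "experiment_keywords":   { "type": "array", "items": { "type": "string" } },
        "notes":                 { "type": "string" }
      },
      "required": ["one_sentence_summary", "biological_importance", "confidence",
                   "experiment_keywords", "notes"]
    }
    """;
```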
+ private static final JsonSchema.Schema experimentResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes") + ) + ) + .build(); + + private static final JsonSchema.Schema finalResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", + Map.of("type", "array", + "minimum", 1, + "items", + Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", + Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", + "items", Map.of("type", "string")) + ) + ) + ) + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ) + ) + ) + .build(); - public static JSONObject summariseExpression(ExpressionData expressionData) { + public static JSONObject summarizeExpression(RecordInstance geneRecord) throws WdkUserException { - try { - // Print retrieved data (debugging) - System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); - System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); - - // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = ExperimentProcessor.processExpressionData(expressionData); - System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); + try { + // Process expression data further into a list of pruned metadata plus data + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - // Send AI requests in parallel + // Send AI requests in parallel // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI - List> aiRequests = experimentsWithData.stream() - .map(Summarizer::sendExperimentToOpenAI) - .collect(Collectors.toList()); - // Wait for all requests to complete - List responses = aiRequests.stream() - .map(CompletableFuture::join) // Blocks until each completes - .collect(Collectors.toList()); - - // Debug output + List> aiRequests = experimentsWithData.stream() + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); + // Wait for all requests to complete + List responses = aiRequests.stream() + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); + + // Debug output // System.out.println("Individual responses:"); - // responses.forEach(response -> System.out.println(response.toString(2))); + // responses.forEach(response -> System.out.println(response.toString(2))); // System.exit(0); JSONObject finalSummary = 
sendExperimentSummariesToOpenAI(responses); return finalSummary; - } catch (Exception e) { - // Handle errors gracefully - System.err.println("Error fetching expression data: " + e.getMessage()); - e.printStackTrace(); // Print stack trace for debugging - } - - return null; + } catch (WdkModelException e) { + // Handle errors gracefully + System.err.println("Error fetching expression data: " + e.getMessage()); + throw new WdkUserException(e); } + } - private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { - // Possible TO DO: AI EDIT DESCRIPTION - // Before sending the experiment+data to the AI, ask the AI to edit the `description` field - // as follows: (This should be cached by dataset_id only and would be called once per organism - // and would reduce tokens and "cognitive load" a little bit for the next step.) - // - // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." - // - // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + // Possible TO DO: AI EDIT DESCRIPTION + // Before sending the experiment+data to the AI, ask the AI to edit the `description` field + // as follows: (This should be cached by dataset_id only and would be called once per organism + // and would reduce tokens and "cognitive load" a little bit for the next step.) + // + // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. - // We don't need to send the dataset_id to the AI but it's useful to have in the - // response for phase two - so we save it for later - JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; - experimentForAI.remove("dataset_id"); + // We don't need to send the dataset_id to the AI but it's useful to have in the + // response for phase two - so we save it for later + JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; + experimentForAI.remove("dataset_id"); - String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. 
Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. 
Do not include any duplicate fields."; -// System.out.println(message); /// DEBUG - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("experiment-summary") - .schema(experimentResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) + // System.out.println(message); /// DEBUG + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) // .temperature(1.0) - .build(); + .build(); - // add dataset_id back to the response - return openAIClient.chat().completions().create(request) + // add dataset_id back to the response + return openAIClient.chat().completions().create(request) .thenApply(completion -> { - // response is a JSON string - String jsonString = completion.choices().get(0).message().content().get(); - try { - JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - return jsonObject; - } catch (JSONException e) { - System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); - } - }); - } - - - private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + return jsonObject; + } catch (JSONException e) { + System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + } + }); + } + + + private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. 
Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("expression-summary") - .schema(finalResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); - - // System.out.println(message); - - ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response - String jsonString = completion.choices().get(0).message().content().get(); - JSONObject rawResponseObject = new JSONObject(jsonString); - - // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI - JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("expression-summary") + .schema(finalResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) + .build(); + + // System.out.println(message); + + ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response + String jsonString = completion.choices().get(0).message().content().get(); + JSONObject rawResponseObject = new JSONObject(jsonString); + + // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return finalResponseObject; - } + return finalResponseObject; + } - public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { - // Gather all dataset IDs from individualResults and map them to summaries - Map datasetSummaries = new HashMap<>(); - for (JSONObject result : individualResults) { - datasetSummaries.put(result.getString("dataset_id"), result); - } + public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { + // Gather all dataset IDs from individualResults and map them to summaries + Map datasetSummaries = new HashMap<>(); + for (JSONObject result : individualResults) { + datasetSummaries.put(result.getString("dataset_id"), result); + } - Set seenDatasetIds = new HashSet<>(); - JSONArray deduplicatedSections = new JSONArray(); - JSONArray sections = summaryResponse.getJSONArray("sections"); + Set seenDatasetIds = new HashSet<>(); + JSONArray deduplicatedSections = new JSONArray(); + JSONArray sections = summaryResponse.getJSONArray("sections"); - for (int i = 0; i < sections.length(); i++) { - JSONObject section = sections.getJSONObject(i); - JSONArray datasetIds = section.getJSONArray("dataset_ids"); - JSONArray summaries = new JSONArray(); + for (int i = 0; i < sections.length(); i++) { + JSONObject section = sections.getJSONObject(i); + JSONArray datasetIds = section.getJSONArray("dataset_ids"); + JSONArray summaries = new JSONArray(); - for (int j = 0; j < datasetIds.length(); j++) { - String id = datasetIds.getString(j); + for (int j = 0; j < datasetIds.length(); j++) { + String id = datasetIds.getString(j); - 
// Warn and skip if the id doesn't exist - if (!datasetSummaries.containsKey(id)) { - System.out.println("WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); - continue; - } - // Skip if we've seen it - if (seenDatasetIds.contains(id)) continue; + // Warn and skip if the id doesn't exist + if (!datasetSummaries.containsKey(id)) { + System.out.println("WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); + continue; + } + // Skip if we've seen it + if (seenDatasetIds.contains(id)) continue; - seenDatasetIds.add(id); - summaries.put(datasetSummaries.get(id)); - } + seenDatasetIds.add(id); + summaries.put(datasetSummaries.get(id)); + } - // Update section with mapped summaries and remove dataset_ids key - section.put("summaries", summaries); - section.remove("dataset_ids"); - deduplicatedSections.put(section); - } + // Update section with mapped summaries and remove dataset_ids key + section.put("summaries", summaries); + section.remove("dataset_ids"); + deduplicatedSections.put(section); + } - // Find missing dataset IDs - Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); - missingDatasetIds.removeAll(seenDatasetIds); + // Find missing dataset IDs + Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); + missingDatasetIds.removeAll(seenDatasetIds); - // If there are missing IDs, add an "Others" section - if (!missingDatasetIds.isEmpty()) { - JSONArray otherSummaries = new JSONArray(); - for (String id : missingDatasetIds) { - otherSummaries.put(datasetSummaries.get(id)); - } + // If there are missing IDs, add an "Others" section + if (!missingDatasetIds.isEmpty()) { + JSONArray otherSummaries = new JSONArray(); + for (String id : missingDatasetIds) { + otherSummaries.put(datasetSummaries.get(id)); + } - JSONObject otherSection = new JSONObject(); - otherSection.put("headline", "Other"); - otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); - otherSection.put("summaries", otherSummaries); - deduplicatedSections.put(otherSection); - } - - // Create final deduplicated summary - JSONObject finalSummary = new JSONObject(summaryResponse.toString()); - finalSummary.put("sections", deduplicatedSections); - return finalSummary; + JSONObject otherSection = new JSONObject(); + otherSection.put("headline", "Other"); + otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); + otherSection.put("summaries", otherSummaries); + deduplicatedSections.put(otherSection); } + // Create final deduplicated summary + JSONObject finalSummary = new JSONObject(summaryResponse.toString()); + finalSummary.put("sections", deduplicatedSections); + return finalSummary; + } + } From 5752cc496bbb04350cbbb288b0f42f691bcb72cb Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:18:59 +0000 Subject: [PATCH 04/31] extra comment --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index f424f3c66..b355b0a3e 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -67,6 +67,7 @@ protected void write(OutputStream out) throws 
IOException, WdkModelException { try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { RecordInstance singleRecord = recordStream.iterator().next(); + // we will need to pass `_cacheMode` to `summarizeExpression()`... JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); out.write(expressionSummary.toString().getBytes()); out.flush(); From 4ce3f5fa6e4dd129267bf254d985c37ade53ba9c Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:29:33 +0000 Subject: [PATCH 05/31] one more comment --- .../model/report/ai/expression/GeneRecordProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 0bb32c745..624216b72 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -58,6 +58,7 @@ public static List processExpressionData(RecordInstance geneRecord, List filteredData = new ArrayList<>(); String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + // (this would be more efficient with a `Map>` made before the `expressionGraphs` loop) List thisExperimentDataRows = new ArrayList<>(); for (TableValueRow dataRow : expressionGraphsDataTable) { if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { From eb203df748816055c55e30d39ae5c49d313cb45f Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 17 Feb 2025 12:43:06 +0000 Subject: [PATCH 06/31] move CacheMode into separate file --- .../apicommon/model/report/ai/CacheMode.java | 26 +++++++++++++++++++ .../ai/SingleGeneAiExpressionReporter.java | 22 +--------------- .../report/ai/expression/Summarizer.java | 4 ++- 3 files changed, 30 insertions(+), 22 deletions(-) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java new file mode 100644 index 000000000..d514ca813 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java @@ -0,0 +1,26 @@ +package org.apidb.apicommon.model.report.ai; + +public enum CacheMode { + + TEST("test"), + POPULATE("populate"); + + private final String mode; + + CacheMode(String mode) { + this.mode = mode; + } + + public String getMode() { + return mode; + } + + public static CacheMode fromString(String mode) throws IllegalArgumentException { + for (CacheMode cm : CacheMode.values()) { + if (cm.mode.equalsIgnoreCase(mode)) { + return cm; + } + } + throw new IllegalArgumentException("Invalid CacheMode: " + mode); + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index b355b0a3e..9b8a0f336 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -23,26 +23,6 @@ public class SingleGeneAiExpressionReporter extends AbstractReporter { - public enum CacheMode { - 
TEST("test"), - POPULATE("populate"); - private final String mode; - CacheMode(String mode) { - this.mode = mode; - } - public String getMode() { - return mode; - } - public static CacheMode fromString(String mode) throws IllegalArgumentException { - for (CacheMode cm : CacheMode.values()) { - if (cm.mode.equalsIgnoreCase(mode)) { - return cm; - } - } - throw new IllegalArgumentException("Invalid CacheMode: " + mode); - } - } - private CacheMode _cacheMode = CacheMode.TEST; @Override @@ -68,7 +48,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { RecordInstance singleRecord = recordStream.iterator().next(); // we will need to pass `_cacheMode` to `summarizeExpression()`... - JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); + JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord, _cacheMode); out.write(expressionSummary.toString().getBytes()); out.flush(); } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 98961b7f1..009be461a 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,5 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; +import org.apidb.apicommon.model.report.ai.CacheMode; + import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.WdkModelException; @@ -105,7 +107,7 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord) throws WdkUserException { + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { // Process expression data further into a list of pruned metadata plus data From dfab346e1fc60eea9857f3c62f53a3a848b6c574 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 18 Feb 2025 09:33:13 +0000 Subject: [PATCH 07/31] WIP cache wiring --- .../ai/expression/AiExpressionCache.java | 109 ++++++++++++++++++ .../ai/expression/GeneRecordProcessor.java | 4 +- .../report/ai/expression/Summarizer.java | 57 +++++++-- 3 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java new file mode 100644 index 000000000..76a1cd65d --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -0,0 +1,109 @@ +package org.apidb.apicommon.model.report.ai.expression; + + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.Files; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HexFormat; +import java.util.function.Function; +import org.json.JSONObject; +import org.json.JSONException; + +import org.gusdb.fgputil.cache.disk.OnDiskCache; +import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; +import 
org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException;
+
+public class AiExpressionCache extends OnDiskCache {
+
+  // Default cache location and timing settings
+  private static final Path DEFAULT_CACHE_DIR = Paths.get("/tmp/expressionCache");
+  private static final long DEFAULT_TIMEOUT_MILLIS = 5000;
+  private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500;
+
+  // No-argument constructor using defaults
+  public AiExpressionCache() throws IOException {
+    super(DEFAULT_CACHE_DIR, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS);
+  }
+
+  // Compute SHA-256 hash digest of input
+  private static String computeDigest(String input) throws NoSuchAlgorithmException {
+    MessageDigest digest = MessageDigest.getInstance("SHA-256");
+    byte[] hash = digest.digest(input.getBytes());
+    return HexFormat.of().formatHex(hash);
+  }
+
+  // Check if cached data is valid
+  public boolean isCacheValid(String cacheKey, String inputData) {
+
+    try {
+      FunctionWithException<Path, Boolean> visitor = entryDir -> {
+        Path digestFile = entryDir.resolve("digest.txt");
+
+        if (!Files.exists(digestFile)) {
+          System.out.println("No digest file found.");
+          return false;
+        }
+
+        // Read stored digest and compare
+        String cachedDigest = Files.readString(digestFile);
+        String computedDigest = computeDigest(inputData);
+
+        if (cachedDigest.equals(computedDigest)) {
+          System.out.println("Cache digest matches input.");
+          return true;
+        } else {
+          System.out.println("Cache digest mismatch! Cache is out of date.");
+          return false;
+        }
+      };
+
+      return visitContent(cacheKey, visitor);
+
+    } catch (EntryNotCreatedException e) {
+      System.out.println("Cache entry does not exist yet.");
+      return false;
+    } catch (Exception e) {
+      throw new RuntimeException("Error validating cache entry", e);
+    }
+  }
+
+  // Populate cache with computed data (Method 1: Takes computedData directly)
+  public void populateCache(String cacheKey, String inputData, JSONObject computedData) throws Exception {
+    ConsumerWithException<Path> populator = entryDir -> {
+      Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString());
+      Files.writeString(entryDir.resolve("digest.txt"), computeDigest(inputData));
+    };
+
+    // Populate with overwrite policy (assumes caller ensures it's necessary)
+    populateAndProcessContent(cacheKey, populator, path -> null, Overwrite.YES);
+  }
+
+// // Populate cache with a function that computes the result (Method 2: Uses a function)
+// public void populateCache(String cacheKey, String inputData, Function<String, JSONObject> computation) throws Exception {
+//   populateCache(cacheKey, inputData, computation.apply(inputData));
+// }
+
+  // Read cached data (throws IOException if missing)
+  public JSONObject readCachedData(String cacheKey) throws Exception {
+    FunctionWithException<Path, JSONObject> visitor = entryDir -> {
+      Path file = entryDir.resolve("cached_data.txt");
+      if (!Files.exists(file)) {
+        throw new IOException("Cache entry missing: " + file);
+      }
+      String fileContents = Files.readString(file);
+      try {
+        JSONObject jsonObject = new JSONObject(fileContents);
+        return jsonObject;
+      } catch (JSONException e) {
+        throw e;
+      }
+    };
+
+    return visitContent(cacheKey, visitor);
+  }
+
+}
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index 624216b72..4916fd76c 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++
b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -44,12 +44,14 @@ public static List processExpressionData(RecordInstance geneRecord, // return value: List experiments = new ArrayList<>(); + String geneId = geneRecord.getAttributeValue("gene_id").getValue(); TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); for (TableValueRow experimentRow : expressionGraphs) { JSONObject experimentInfo = new JSONObject(); - + experimentInfo.put("gene_id", geneId); + // Extract all relevant attributes for (String key : KEYS_TO_KEEP) { experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 009be461a..b575677e5 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,6 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; import org.apidb.apicommon.model.report.ai.CacheMode; +import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.WdkUserException; @@ -25,6 +26,7 @@ import java.util.HashSet; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; +import java.io.IOException; public class Summarizer { private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() @@ -32,9 +34,21 @@ public class Summarizer { .maxRetries(32) // Handle 429 errors .build(); + private static final AiExpressionCache cache; + + static { + AiExpressionCache tempCache = null; + try { + tempCache = new AiExpressionCache(); + } catch (IOException e) { + throw new RuntimeException("Failed to initialize AiExpressionCache", e); + } + cache = tempCache; + } + // provide exact model number for semi-reproducibility private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 5000; + private static int MAX_RESPONSE_TOKENS = 10000; private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; @@ -107,15 +121,15 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); // Send AI requests in parallel - // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI List> aiRequests = experimentsWithData.stream() .map(Summarizer::sendExperimentToOpenAI) .collect(Collectors.toList()); @@ -125,12 +139,12 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod .collect(Collectors.toList()); // Debug output - // System.out.println("Individual responses:"); + // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); + // System.exit(0); - JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); - return finalSummary; + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; } catch (WdkModelException e) { // Handle errors gracefully @@ -139,7 +153,12 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod } } + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + return sendExperimentToOpenAI(experiment, CacheMode.POPULATE); + } + + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment, CacheMode cacheMode) { // Possible TO DO: AI EDIT DESCRIPTION // Before sending the experiment+data to the AI, ask the AI to edit the `description` field @@ -152,12 +171,15 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e - // We don't need to send the dataset_id to the AI but it's useful to have in the - // response for phase two - so we save it for later + // We don't need to send the gene_id or dataset_id to the AI but we need the gene ID + // for the cache key and it's useful to have dataset_id in the response for phase two + // - so we save them for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String geneId = experimentForAI.has("gene_id") ? experimentForAI.getString("gene_id") : null; + experimentForAI.remove("gene_id"); String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. 
Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + @@ -166,6 +188,19 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e // System.out.println(message); /// DEBUG + String cacheKey = geneId + ':' + datasetId; + + if (cache.isCacheValid(cacheKey, message)) { + try { + return CompletableFuture.completedFuture(cache.readCachedData(cacheKey)); +// } catch (IOException e) { +// // maybe log that the cache was unexpectedly invalidated +// // and then continue to compute and populate cache entry + } catch (Exception e) { + // do nothing + } + } + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) .maxCompletionTokens(MAX_RESPONSE_TOKENS) From 813d2f148b2773fcda9af6bb5572cab7e4621086 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 18 Feb 2025 19:57:09 +0000 Subject: [PATCH 08/31] more cache wrangling --- .../ai/expression/GeneRecordProcessor.java | 2 - .../report/ai/expression/Summarizer.java | 102 ++++++++++++------ 2 files changed, 70 insertions(+), 34 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 4916fd76c..1320fd709 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -44,13 +44,11 @@ public static List processExpressionData(RecordInstance geneRecord, // return value: List experiments = new ArrayList<>(); - String geneId = geneRecord.getAttributeValue("gene_id").getValue(); TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); for (TableValueRow experimentRow : expressionGraphs) { JSONObject experimentInfo = new JSONObject(); - experimentInfo.put("gene_id", geneId); // Extract all relevant attributes for (String key : KEYS_TO_KEEP) { diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index b575677e5..ba3ee6754 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -20,10 +20,12 @@ import com.openai.models.ResponseFormatJsonSchema.JsonSchema; import com.openai.core.JsonValue; import java.util.List; +import java.util.ArrayList; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.util.HashSet; +import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import java.io.IOException; @@ -124,28 +126,57 @@ public class Summarizer { public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { 
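      // A minimal usage sketch of the two cache modes handled below (the method name and
      // the cacheStatus/expressionSummary wrapper keys come from this patch; how a caller
      // chains the two calls is an assumption, not something the patch itself prescribes):
      //
      //   JSONObject result = Summarizer.summarizeExpression(geneRecord, CacheMode.TEST);
      //   if ("miss".equals(result.optString("cacheStatus"))) {
      //     // not everything was cached yet: generate via OpenAI and write the cache entries
      //     result = Summarizer.summarizeExpression(geneRecord, CacheMode.POPULATE);
      //   }
      //   JSONObject summary = result.getJSONObject("expressionSummary");
      //
      // In TEST mode every per-experiment entry (and the final summary) must already be
      // cached; any miss short-circuits with {"cacheStatus": "miss"}. In POPULATE mode
      // missing entries are generated and written back to the on-disk cache.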
+      String geneId = geneRecord.getAttributeValue("gene_id").getValue();
+
       // Process expression data further into a list of pruned metadata plus data
       List<JSONObject> experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord);
       System.out.println("Pre-processed Experiments: " + experimentsWithData.size());
+      // TEST Mode: Collect valid cache entries
+      if (cacheMode == CacheMode.TEST) {
+        List<JSONObject> cachedResponses = new ArrayList<>();
+
+        for (JSONObject experiment : experimentsWithData) {
+
+          Optional<JSONObject> experimentSummary = Summarizer.sendExperimentToOpenAI(geneId, experiment, CacheMode.TEST).join();
+
+          if (experimentSummary.isPresent()) {
+            cachedResponses.add(experimentSummary.get());
+          } else {
+            return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early
+          }
+        }
+
+        // All experiment-level caches are valid, now check final summary cache
+        Optional<JSONObject> finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, CacheMode.TEST);
+        return finalSummary
+          .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary))
+          .orElseGet(() -> new JSONObject().put("cacheStatus", "miss"));
+      }
+
+
       // Send AI requests in parallel
-      List<CompletableFuture<JSONObject>> aiRequests = experimentsWithData.stream()
-        .map(Summarizer::sendExperimentToOpenAI)
+      List<CompletableFuture<Optional<JSONObject>>> aiRequests = experimentsWithData.stream()
+        // TO DO - potentially some optimisation?
+        // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE)))
+        .map(exp -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE))
         .collect(Collectors.toList());
-      // Wait for all requests to complete
+      // Wait for all requests to complete with `join`
       List<JSONObject> responses = aiRequests.stream()
-        .map(CompletableFuture::join) // Blocks until each completes
-        .collect(Collectors.toList());
+        .map(CompletableFuture::join)  // Get Optional<JSONObject>
+        .filter(Optional::isPresent)   // Keep only non-empty results
+        .map(Optional::get)            // Extract JSONObject
+        .collect(Collectors.toList());
       // Debug output
       // System.out.println("Individual responses:");
       // responses.forEach(response -> System.out.println(response.toString(2)));
       // System.exit(0);
-      JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses);
-      return finalSummary;
-
+      Optional<JSONObject> finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, CacheMode.POPULATE);
+      return finalSummary
+        .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary))
+        .orElseGet(() -> new JSONObject().put("cacheStatus", "miss"));
     } catch (WdkModelException e) {
       // Handle errors gracefully
       System.err.println("Error fetching expression data: " + e.getMessage());
@@ -154,11 +185,7 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod
     }
-  private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject experiment) {
-    return sendExperimentToOpenAI(experiment, CacheMode.POPULATE);
-  }
-
-  private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject experiment, CacheMode cacheMode) {
+  private static CompletableFuture<Optional<JSONObject>> sendExperimentToOpenAI(String geneId, JSONObject experiment, CacheMode cacheMode) {
     // Possible TO DO: AI EDIT DESCRIPTION
     // Before sending the experiment+data to the AI, ask the AI to edit the `description` field
@@ -171,12 +198,9 @@ private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject e
-    // We don't need to send the gene_id or dataset_id to the AI but we need the gene ID
-    // for the cache
key and it's useful to have dataset_id in the response for phase two - // - so we save them for later + // We don't need to send dataset_id to the AI but it's useful to have it + // in the response for phase two JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String geneId = experimentForAI.has("gene_id") ? experimentForAI.getString("gene_id") : null; - experimentForAI.remove("gene_id"); String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); @@ -192,15 +216,20 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e if (cache.isCacheValid(cacheKey, message)) { try { - return CompletableFuture.completedFuture(cache.readCachedData(cacheKey)); -// } catch (IOException e) { -// // maybe log that the cache was unexpectedly invalidated -// // and then continue to compute and populate cache entry + JSONObject cachedResponse = cache.readCachedData(cacheKey); + return CompletableFuture.completedFuture(Optional.of(cachedResponse)); } catch (Exception e) { - // do nothing + System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); + + if (cacheMode == CacheMode.TEST) { + return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss + } + // Else, log and fall through to AI generation } + } else if (cacheMode == CacheMode.TEST) { + return CompletableFuture.completedFuture(Optional.empty()); } - + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) .maxCompletionTokens(MAX_RESPONSE_TOKENS) @@ -212,7 +241,6 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e .build()) .addSystemMessage(systemMessage) .addUserMessage(message) - // .temperature(1.0) .build(); // add dataset_id back to the response @@ -223,17 +251,27 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e try { JSONObject jsonObject = new JSONObject(jsonString); jsonObject.put("dataset_id", datasetId); - return jsonObject; + + // Cache the response + try { + cache.populateCache(cacheKey, message, jsonObject); + } catch (Exception e) { + System.err.println("Warning: Failed to cache response for gene " + geneId + + " and dataset " + datasetId + ": " + e.getMessage()); + } + + return Optional.of(jsonObject); } catch (JSONException e) { - System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + ": " + e.getMessage()); System.err.println("Raw response: " + jsonString); - return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + return Optional.of(errorResponse); } }); } - private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + private static Optional sendExperimentSummariesToOpenAI(String geneId, List experiments, CacheMode cacheMode) { String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + @@ -258,10 +296,10 @@ private static JSONObject sendExperimentSummariesToOpenAI(List exper String jsonString = completion.choices().get(0).message().content().get(); JSONObject rawResponseObject = new JSONObject(jsonString); - // TO DO - quality 
control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return finalResponseObject; + return Optional.of(finalResponseObject); } From 5c323c369c1194e16a95cad5e35e1068552bdba1 Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Tue, 18 Feb 2025 16:17:48 -0500 Subject: [PATCH 09/31] a few changes for error handling and to set up caching --- .../apicommon/model/report/ai/CacheMode.java | 24 +-- .../ai/SingleGeneAiExpressionReporter.java | 83 ++++++---- .../report/ai/expression/ExpressionData.java | 25 +-- .../ai/expression/GeneRecordProcessor.java | 150 +++++++++++------- .../report/ai/expression/Summarizer.java | 71 ++++----- 5 files changed, 192 insertions(+), 161 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java index d514ca813..110c95967 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java @@ -1,26 +1,6 @@ package org.apidb.apicommon.model.report.ai; public enum CacheMode { - - TEST("test"), - POPULATE("populate"); - - private final String mode; - - CacheMode(String mode) { - this.mode = mode; - } - - public String getMode() { - return mode; - } - - public static CacheMode fromString(String mode) throws IllegalArgumentException { - for (CacheMode cm : CacheMode.values()) { - if (cm.mode.equalsIgnoreCase(mode)) { - return cm; - } - } - throw new IllegalArgumentException("Invalid CacheMode: " + mode); - } + TEST, + POPULATE; } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 9b8a0f336..29cd28036 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -1,65 +1,78 @@ package org.apidb.apicommon.model.report.ai; -import org.gusdb.wdk.model.report.AbstractReporter; -import org.gusdb.wdk.model.report.Reporter; -import org.gusdb.wdk.model.report.ReporterConfigException; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apidb.apicommon.model.TranscriptUtil; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.record.RecordClass; -import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; -import org.apidb.apicommon.model.TranscriptUtil; +import org.gusdb.wdk.model.record.RecordClass; +import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.record.TableField; -import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; - +import org.gusdb.wdk.model.report.AbstractReporter; +import 
org.gusdb.wdk.model.report.Reporter; +import org.gusdb.wdk.model.report.ReporterConfigException; +import org.json.JSONException; import org.json.JSONObject; -import java.io.IOException; -import java.io.OutputStream; -import java.util.Map; -import java.util.List; -import java.util.stream.Collectors; -public class SingleGeneAiExpressionReporter extends AbstractReporter { +public class SingleGeneAiExpressionReporter extends AbstractReporter { + + private static final int MAX_RESULT_SIZE = 1; // one gene at a time for now private CacheMode _cacheMode = CacheMode.TEST; - + @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { + // assign cache mode if (config.has("cacheMode")) { - _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + _cacheMode = CacheMode.valueOf(config.getString("cacheMode").toUpperCase()); + } + + // check model config; this should only be assigned to genes + RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); + if (_baseAnswer.getQuestion().getRecordClass() != geneRecordClass) { + throw new WdkModelException(SingleGeneAiExpressionReporter.class.getName() + + " should only be assigned to " + geneRecordClass.getFullName()); + } + + // check result size; limit to small results due to OpenAI cost + if (_baseAnswer.getResultSizeFactory().getResultSize() > MAX_RESULT_SIZE) { + throw new ReporterConfigException("This reporter cannot be called with results of size greater than " + MAX_RESULT_SIZE); } - } catch (IllegalArgumentException e) { - throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); + } + catch (JSONException | IllegalArgumentException e) { + throw new ReporterConfigException("Invalid cacheMode value: " + config.get("cacheMode"), e); } return this; } @Override protected void write(OutputStream out) throws IOException, WdkModelException { - RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); - Map tableFields = geneRecordClass.getTableFieldMap(); - List tables = List.of("ExpressionGraphs", "ExpressionGraphsDataTable").stream() - .map(name -> tableFields.get(name)) - .collect(Collectors.toList()); - try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { - RecordInstance singleRecord = recordStream.iterator().next(); - // we will need to pass `_cacheMode` to `summarizeExpression()`... 
- JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord, _cacheMode); - out.write(expressionSummary.toString().getBytes()); - out.flush(); + Map tableFields = _baseAnswer.getQuestion().getRecordClass().getTableFieldMap(); + List tables = GeneRecordProcessor.REQUIRED_TABLE_NAMES.stream() + .map(name -> tableFields.get(name)).collect(Collectors.toList()); + + try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out))) { + for (RecordInstance record : recordStream) { + JSONObject expressionSummary = Summarizer.summarizeExpression(record, _cacheMode); + writer.write(expressionSummary.toString()); + } } catch (WdkUserException e) { throw new WdkModelException(e); } - - } - } - - diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java index c2e688878..9d807d770 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java @@ -4,19 +4,20 @@ import java.util.List; public class ExpressionData { - private final List expressionGraphs; - private final List expressionGraphsDataTable; - public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { - this.expressionGraphs = expressionGraphs; - this.expressionGraphsDataTable = expressionGraphsDataTable; - } + private final List expressionGraphs; + private final List expressionGraphsDataTable; - public List getExpressionGraphs() { - return expressionGraphs; - } + public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { + this.expressionGraphs = expressionGraphs; + this.expressionGraphsDataTable = expressionGraphsDataTable; + } - public List getExpressionGraphsDataTable() { - return expressionGraphsDataTable; - } + public List getExpressionGraphs() { + return expressionGraphs; + } + + public List getExpressionGraphsDataTable() { + return expressionGraphsDataTable; + } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 624216b72..09b30685a 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,84 +1,122 @@ package org.apidb.apicommon.model.report.ai.expression; -import org.gusdb.wdk.model.record.RecordInstance; -import org.gusdb.wdk.model.record.TableValue; -import org.gusdb.wdk.model.record.TableValueRow; -import org.gusdb.wdk.model.WdkUserException; -import org.gusdb.wdk.model.WdkModelException; - -import org.json.JSONArray; -import org.json.JSONObject; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import java.util.stream.StreamSupport; + +import org.gusdb.fgputil.EncryptionUtil; +import org.gusdb.fgputil.json.JsonUtil; +import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.record.TableValue; +import org.gusdb.wdk.model.record.TableValueRow; +import org.json.JSONObject; /** * expects a geneRecord with two tables: "ExpressionGraphs" and 
"ExpressionGraphsDataTable" * * returns a list of JSON Objects of data ready to feed the AI */ - public class GeneRecordProcessor { - private static final Set KEYS_TO_KEEP = - Set.of( - "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", - "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" - ); - public static List processExpressionData(RecordInstance geneRecord) throws WdkModelException, WdkUserException { + private static final Set KEYS_TO_KEEP = Set.of("y_axis", "description", "genus_species", + "project_id", "summary", "dataset_id", "assay_type", "x_axis", "module", "dataset_name", "display_name", + "short_attribution", "paralog_number"); + + private static final String EXPRESSION_GRAPH_TABLE = "ExpressionGraphs"; + private static final String EXPRESSION_GRAPH_DATA_TABLE = "ExpressionGraphsDataTable"; + + public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + + public interface GeneSummaryInputs { + + String getGeneId(); + + List getExperimentsWithData(); + + default String getExperimentsDigest() { + return EncryptionUtil.md5(getExperimentsWithData().stream() + .map(JsonUtil::serialize).collect(Collectors.joining())); + } + } + + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record) throws WdkModelException { + String geneId = record.getPrimaryKey().getValues().get("gene_source_id"); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record); + return new GeneSummaryInputs() { + @Override + public String getGeneId() { + return geneId; + } + @Override + public List getExperimentsWithData() { + return experimentsWithData; + } + }; + } + + static List processExpressionData(RecordInstance geneRecord) + throws WdkModelException { return processExpressionData(geneRecord, 0); } // for debugging only - public static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException, WdkUserException { + static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException { List experiments = processExpressionData(geneRecord, 0); - return experiments.stream() - .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) - .collect(Collectors.toList()); + return experiments.stream().filter( + experiment -> datasetId.equals(experiment.getString("dataset_id"))).collect(Collectors.toList()); } // maxExperiments is for dev/debugging only - public static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException, WdkUserException { - // return value: - List experiments = new ArrayList<>(); - - TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); - TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); - - for (TableValueRow experimentRow : expressionGraphs) { - JSONObject experimentInfo = new JSONObject(); - - // Extract all relevant attributes - for (String key : KEYS_TO_KEEP) { - experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); - } - - List filteredData = new ArrayList<>(); - String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); - // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` - // (this would be more efficient with a `Map>` made before the `expressionGraphs` loop) - List thisExperimentDataRows = new ArrayList<>(); - for (TableValueRow dataRow 
: expressionGraphsDataTable) { - if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { - JSONObject dataEntry = new JSONObject(); - - // Extract relevant numeric fields - List dataKeys = List.of("value", "standard_error", "percentile_channel1", "percentile_channel2", "sample_name"); - for (String key : dataKeys) { - dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + static List processExpressionData(RecordInstance geneRecord, int maxExperiments) + throws WdkModelException { + try { + // return value: + List experiments = new ArrayList<>(); + + TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); + TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); + + for (TableValueRow experimentRow : expressionGraphs) { + JSONObject experimentInfo = new JSONObject(); + + // Extract all relevant attributes + for (String key : KEYS_TO_KEEP) { + experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); + } + + List filteredData = new ArrayList<>(); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + // (this would be more efficient with a `Map>` made before the + // `expressionGraphs` loop) + for (TableValueRow dataRow : expressionGraphsDataTable) { + if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { + JSONObject dataEntry = new JSONObject(); + + // Extract relevant numeric fields + List dataKeys = List.of("value", "standard_error", "percentile_channel1", + "percentile_channel2", "sample_name"); + for (String key : dataKeys) { + dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + } + + filteredData.add(dataEntry); } - - filteredData.add(dataEntry); } + + experimentInfo.put("data", filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) + break; } - - experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); - - if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; + return experiments; + } + catch (WdkUserException e) { + throw new WdkModelException(e.getMessage()); } - return experiments; } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 009be461a..4b78d6c58 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,32 +1,32 @@ package org.apidb.apicommon.model.report.ai.expression; -import org.apidb.apicommon.model.report.ai.CacheMode; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; -import org.gusdb.wdk.model.record.RecordInstance; -import org.gusdb.wdk.model.WdkUserException; +import org.apidb.apicommon.model.report.ai.CacheMode; import org.gusdb.wdk.model.WdkModelException; - -import org.json.JSONObject; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.record.RecordInstance; import org.json.JSONArray; import org.json.JSONException; +import org.json.JSONObject; import com.openai.client.OpenAIClientAsync; import com.openai.client.okhttp.OpenAIOkHttpClientAsync; +import 
com.openai.core.JsonValue; +import com.openai.models.ChatCompletion; import com.openai.models.ChatCompletionCreateParams; import com.openai.models.ChatModel; -import com.openai.models.ChatCompletion; import com.openai.models.ResponseFormatJsonSchema; import com.openai.models.ResponseFormatJsonSchema.JsonSchema; -import com.openai.core.JsonValue; -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.util.Set; -import java.util.HashSet; -import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; public class Summarizer { + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() .fromEnv() // Uses OPENAI_API_KEY from env .maxRetries(32) // Handle 429 errors @@ -106,33 +106,34 @@ public class Summarizer { ) ) .build(); - - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { - + + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + try { // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - + // Send AI requests in parallel - // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI List> aiRequests = experimentsWithData.stream() - .map(Summarizer::sendExperimentToOpenAI) - .collect(Collectors.toList()); + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); // Wait for all requests to complete List responses = aiRequests.stream() - .map(CompletableFuture::join) // Blocks until each completes - .collect(Collectors.toList()); + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); // Debug output - // System.out.println("Individual responses:"); + // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); - - JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); - return finalSummary; + // System.exit(0); - } catch (WdkModelException e) { + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; + + } + catch (WdkModelException e) { // Handle errors gracefully System.err.println("Error fetching expression data: " + e.getMessage()); throw new WdkUserException(e); @@ -150,16 +151,14 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e // // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. - - // We don't need to send the dataset_id to the AI but it's useful to have in the // response for phase two - so we save it for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone String datasetId = experimentForAI.has("dataset_id") ? 
experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + - "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + + String message = String.format("The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + "```json\n%s\n```\n\n", experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. 
Do not include any duplicate fields."; @@ -200,8 +199,8 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + + String message = String.format("Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() From 0e6b9451980d7e5b46759c035d63be4714ae67ba Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Thu, 20 Feb 2025 11:14:26 -0500 Subject: [PATCH 10/31] Massage class roles --- .../apicommon/model/report/ai/CacheMode.java | 6 - .../ai/SingleGeneAiExpressionReporter.java | 52 ++- .../ai/expression/AiExpressionCache.java | 112 ++++--- .../report/ai/expression/ExpressionData.java | 23 -- .../ai/expression/GeneRecordProcessor.java | 85 +++-- .../report/ai/expression/Summarizer.java | 297 +++++++++--------- 6 files changed, 325 insertions(+), 250 deletions(-) delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java deleted file mode 100644 index 110c95967..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java +++ /dev/null @@ -1,6 +0,0 @@ -package org.apidb.apicommon.model.report.ai; - -public enum CacheMode { - TEST, - POPULATE; -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 29cd28036..ee0eb86eb 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -9,8 +9,10 @@ import java.util.stream.Collectors; import org.apidb.apicommon.model.TranscriptUtil; +import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; import org.apidb.apicommon.model.report.ai.expression.Summarizer; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; @@ -28,15 +30,15 @@ public class SingleGeneAiExpressionReporter extends AbstractReporter { private 
static final int MAX_RESULT_SIZE = 1; // one gene at a time for now - private CacheMode _cacheMode = CacheMode.TEST; + private static final String POPULATION_MODE_PROP_KEY = "populateIfNotPresent"; + + private boolean _populateIfNotPresent; @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { // assign cache mode - if (config.has("cacheMode")) { - _cacheMode = CacheMode.valueOf(config.getString("cacheMode").toUpperCase()); - } + _populateIfNotPresent = config.optBoolean(POPULATION_MODE_PROP_KEY, false); // check model config; this should only be assigned to genes RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); @@ -59,19 +61,53 @@ public Reporter configure(JSONObject config) throws ReporterConfigException, Wdk @Override protected void write(OutputStream out) throws IOException, WdkModelException { + // get table fields needed to produce summary inputs Map tableFields = _baseAnswer.getQuestion().getRecordClass().getTableFieldMap(); List tables = GeneRecordProcessor.REQUIRED_TABLE_NAMES.stream() .map(name -> tableFields.get(name)).collect(Collectors.toList()); + // open summary cache (manages persistence of expression data) + AiExpressionCache cache = AiExpressionCache.getInstance(_wdkModel); + + // create summarizer (interacts with OpenAI) + Summarizer summarizer = new Summarizer(_wdkModel); + + // open record and output streams try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out))) { + + // write a JSON object with gene ID keys and expression summary values + writer.write("{"); + boolean firstRecord = true; for (RecordInstance record : recordStream) { - JSONObject expressionSummary = Summarizer.summarizeExpression(record, _cacheMode); - writer.write(expressionSummary.toString()); + + // create summary inputs + GeneSummaryInputs summaryInputs = GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage); + + // fetch summary, producing if necessary and requested + JSONObject expressionSummary = _populateIfNotPresent + ? 
getSummary(summaryInputs, summarizer, cache) + : readSummary(summaryInputs, cache); + + // join entries with commas + if (firstRecord) firstRecord = false; else writer.write(","); + + // write JSON object + writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); + } } - catch (WdkUserException e) { - throw new WdkModelException(e); + } + + private JSONObject getSummary(GeneSummaryInputs summaryInputs, Summarizer summarizer, AiExpressionCache cache) { + try { + + } + } + + private JSONObject readSummary(GeneSummaryInputs summaryInputs, AiExpressionCache cache) { + try { + } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 2a9291dc4..1b48ea698 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -1,75 +1,109 @@ package org.apidb.apicommon.model.report.ai.expression; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.nio.file.Files; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.function.Function; -import org.json.JSONObject; -import org.json.JSONException; +import java.util.Optional; +import java.util.Set; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; +import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; +import org.gusdb.fgputil.cache.disk.OnDiskCache.Overwrite; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; +import org.gusdb.wdk.model.WdkModel; +import org.json.JSONException; +import org.json.JSONObject; -public class AiExpressionCache extends OnDiskCache { +public class AiExpressionCache { + + // constants to determine cache location + private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; + private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; - // Default cache location and timing settings - private static final Path DEFAULT_CACHE_DIR = Paths.get("/tmp/expressionCache"); private static final long DEFAULT_TIMEOUT_MILLIS = 5000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; - // No-argument constructor using defaults - public AiExpressionCache() throws IOException { - super(DEFAULT_CACHE_DIR, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + // singleton pattern + private static AiExpressionCache _instance; + + public static synchronized AiExpressionCache getInstance(WdkModel wdkModel) throws IOException { + if (_instance == null) { + _instance = new AiExpressionCache(wdkModel); + } + else if (_instance._wdkModel != wdkModel) { + // callers should always use the same model + throw new IllegalStateException("Attempt to get instance with different model than previously used."); + } + return _instance; } - // Check if cached data is valid - public boolean isCacheValid(String cacheKey, String inputData) { + private final WdkModel _wdkModel; + private final OnDiskCache _cache; + public AiExpressionCache(WdkModel wdkModel) throws IOException { + _wdkModel = wdkModel; + + Path cacheParentDir = Optional + 
.ofNullable(_wdkModel.getProperties().get(CACHE_DIR_PROP_NAME)) + .map(Paths::get) + .orElse(Paths.get(_wdkModel.getModelConfig().getWdkTempDir().toString(), DEFAULT_TMP_CACHE_SUBDIR)); + + _cache = new OnDiskCache(cacheParentDir, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + + } + + public void blah() { + _cache.populateAndProcessContent(geneId, populator, visitor, overwritePredicate) + } + + // Check if cached data is valid + public boolean isCacheValid(GeneSummaryInputs summaryInputs) { try { FunctionWithException visitor = entryDir -> { - Path digestFile = entryDir.resolve("digest.txt"); - - if (!Files.exists(digestFile)) { - System.out.println("No digest file found."); - return false; - } - - // Read stored digest and compare - String cachedDigest = Files.readString(digestFile); - String computedDigest = computeDigest(inputData); - - if (cachedDigest.equals(computedDigest)) { - System.out.println("Cache digest matches input."); - return true; - } else { - System.out.println("Cache digest mismatch! Cache is out of date."); - return false; - } + Path digestFile = entryDir.resolve("digest.txt"); + + if (!Files.exists(digestFile)) { + System.out.println("No digest file found."); + return false; + } + + // Read stored digest and compare + String cachedDigest = Files.readString(digestFile); + + if (cachedDigest.equals(summaryInputs.getExperimentsDigest())) { + System.out.println("Cache digest matches input."); + return true; + } + else { + System.out.println("Cache digest mismatch! Cache is out of date."); + return false; + } }; - return visitContent(cacheKey, visitor); + return _cache.visitContent(summaryInputs.getGeneId(), visitor); - } catch (EntryNotCreatedException e) { + } + catch (EntryNotCreatedException e) { System.out.println("Cache entry does not exist yet."); return false; - } catch (Exception e) { + } + catch (Exception e) { throw new RuntimeException("Error validating cache entry", e); } } // Populate cache with computed data (Method 1: Takes computedData directly) - public void populateCache(String cacheKey, String inputData, JSONObject computedData) throws Exception { + public void populateCache(GeneSummaryInputs summaryInputs, JSONObject computedData) throws Exception { ConsumerWithException populator = entryDir -> { Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString()); - Files.writeString(entryDir.resolve("digest.txt"), computeDigest(inputData)); + Files.writeString(entryDir.resolve("digest.txt"), summaryInputs.getExperimentsDigest()); }; // Populate with overwrite policy (assumes caller ensures it's necessary) - populateAndProcessContent(cacheKey, populator, path -> null, Overwrite.YES); + _cache.populateAndProcessContent(summaryInputs.getGeneId(), populator, path -> null, Overwrite.YES); } // // Populate cache with a function that computes the result (Method 2: Uses a function) @@ -94,7 +128,7 @@ public JSONObject readCachedData(String cacheKey) throws Exception { } }; - return visitContent(cacheKey, visitor); + return _cache.visitContent(cacheKey, visitor); } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java deleted file mode 100644 index 9d807d770..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java +++ /dev/null @@ -1,23 +0,0 @@ -package org.apidb.apicommon.model.report.ai.expression; - -import org.json.JSONObject; -import java.util.List; - 
-public class ExpressionData { - - private final List expressionGraphs; - private final List expressionGraphsDataTable; - - public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { - this.expressionGraphs = expressionGraphs; - this.expressionGraphsDataTable = expressionGraphsDataTable; - } - - public List getExpressionGraphs() { - return expressionGraphs; - } - - public List getExpressionGraphsDataTable() { - return expressionGraphsDataTable; - } -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index fce627768..807cec0f3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,8 +1,11 @@ package org.apidb.apicommon.model.report.ai.expression; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; import org.gusdb.fgputil.EncryptionUtil; @@ -30,53 +33,61 @@ public class GeneRecordProcessor { public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + public interface ExperimentInputs { + + String getCacheKey(); + + String getDigest(); + + JSONObject getExperimentData(); + } + public interface GeneSummaryInputs { - String getGeneId(); + String getGeneId(); // is the cache key - List getExperimentsWithData(); + Map getExperimentsWithData(); default String getExperimentsDigest() { - return EncryptionUtil.md5(getExperimentsWithData().stream() - .map(JsonUtil::serialize).collect(Collectors.joining())); + // TODO Does it make more sense to md5 the concatenation of the experiment hashes? 
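As a side-by-side for the TODO above, a minimal self-contained sketch of the two digest strategies being weighed. It uses java.security.MessageDigest directly rather than the project's EncryptionUtil, and the class and method names are invented for illustration. Either strategy changes whenever any experiment's serialized content changes; the second could be assembled from the per-experiment digests that are already stored with the experiment-level cache entries.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.stream.Collectors;

// Illustrative only: compares the digest currently built in getDigest() with the
// alternative raised in the TODO (hashing the concatenation of per-experiment hashes).
public class DigestStrategySketch {

  // strategy used in this patch: md5 over the concatenation of serialized experiment JSON
  static String digestOfSerializedExperiments(List<String> serializedExperiments) {
    return md5(String.join("", serializedExperiments));
  }

  // alternative from the TODO: md5 over the concatenation of per-experiment md5 digests,
  // which could reuse the digests already stored with each experiment-level cache entry
  static String digestOfExperimentDigests(List<String> serializedExperiments) {
    return md5(serializedExperiments.stream()
        .map(DigestStrategySketch::md5)
        .collect(Collectors.joining()));
  }

  static String md5(String input) {
    try {
      byte[] hash = MessageDigest.getInstance("MD5")
          .digest(input.getBytes(StandardCharsets.UTF_8));
      StringBuilder hex = new StringBuilder();
      for (byte b : hash) {
        hex.append(String.format("%02x", b));
      }
      return hex.toString();
    }
    catch (NoSuchAlgorithmException e) {
      throw new IllegalStateException("MD5 not available", e); // never thrown on a standard JRE
    }
  }
}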
+ return EncryptionUtil.md5(getExperimentsWithData().values().stream() + .map(ExperimentInputs::getExperimentData) + .map(JsonUtil::serialize) + .collect(Collectors.joining())); } } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record) throws WdkModelException { - String geneId = record.getPrimaryKey().getValues().get("gene_source_id"); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record); + private static String getGeneId(RecordInstance record) { + return record.getPrimaryKey().getValues().get("gene_source_id"); + } + + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function experimentDigester) throws WdkModelException { + + String geneId = getGeneId(record); + + Map experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + return new GeneSummaryInputs() { @Override public String getGeneId() { return geneId; } + @Override - public List getExperimentsWithData() { + public Map getExperimentsWithData() { return experimentsWithData; } }; } - static List processExpressionData(RecordInstance geneRecord) - throws WdkModelException { - return processExpressionData(geneRecord, 0); - } - - // for debugging only - static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException { - List experiments = processExpressionData(geneRecord, 0); - return experiments.stream().filter( - experiment -> datasetId.equals(experiment.getString("dataset_id"))).collect(Collectors.toList()); - } - - // maxExperiments is for dev/debugging only - static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException { + private static Map processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: - List experiments = new ArrayList<>(); + Map experiments = new LinkedHashMap<>(); - TableValue expressionGraphs = geneRecord.getTableValue(EXPRESSION_GRAPH_TABLE); - TableValue expressionGraphsDataTable = geneRecord.getTableValue(EXPRESSION_GRAPH_DATA_TABLE); + String geneId = getGeneId(record); + TableValue expressionGraphs = record.getTableValue(EXPRESSION_GRAPH_TABLE); + TableValue expressionGraphsDataTable = record.getTableValue(EXPRESSION_GRAPH_DATA_TABLE); for (TableValueRow experimentRow : expressionGraphs) { @@ -87,11 +98,28 @@ static List processExpressionData(RecordInstance geneRecord, int max experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); } - List filteredData = readFilteredData( - experimentRow.getAttributeValue("dataset_id").getValue(), expressionGraphsDataTable); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + + List filteredData = readFilteredData(datasetId, expressionGraphsDataTable); experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); + + experiments.put(datasetId, new ExperimentInputs() { + @Override + public String getCacheKey() { + return geneId + ':' + datasetId; + } + + @Override + public String getDigest() { + return EncryptionUtil.md5(getExperimentPrompt.apply(getExperimentData())); + } + + @Override + public JSONObject getExperimentData() { + return experimentInfo; + } + }); if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; @@ -122,4 +150,5 @@ private static List readFilteredData(String datasetId, TableValue ex } return filteredData; } + } diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 26b1c3293..865d78457 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -11,7 +11,10 @@ import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; -import org.apidb.apicommon.model.report.ai.CacheMode; +import org.apache.log4j.Logger; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.gusdb.fgputil.json.JsonUtil; +import org.gusdb.wdk.model.WdkModel; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.record.RecordInstance; @@ -30,26 +33,12 @@ public class Summarizer { - private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() - .fromEnv() // Uses OPENAI_API_KEY from env - .maxRetries(32) // Handle 429 errors - .build(); - - private static final AiExpressionCache cache; + private static final Logger LOG = Logger.getLogger(Summarizer.class); - static { - AiExpressionCache tempCache = null; - try { - tempCache = new AiExpressionCache(); - } catch (IOException e) { - throw new RuntimeException("Failed to initialize AiExpressionCache", e); - } - cache = tempCache; - } - // provide exact model number for semi-reproducibility - private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 10000; + public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + + private static final int MAX_RESPONSE_TOKENS = 10000; private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; @@ -122,63 +111,74 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + private static final String OPENAI_API_KEY_PROP_NAME = "OPENAI_API_KEY"; + + private final OpenAIClientAsync _openAIClient; + + public Summarizer(WdkModel wdkModel) { + _openAIClient = OpenAIOkHttpClientAsync.builder() + .apiKey(wdkModel.getProperties().get(OPENAI_API_KEY_PROP_NAME)) + .maxRetries(32) // Handle 429 errors + .build(); + } + + public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean populateIfNotPresent) + throws WdkUserException { try { - String geneId = geneRecord.getAttributeValue("gene_id").getValue(); // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = summaryInputs.getExperimentsWithData(); + String geneId = summaryInputs.getGeneId(); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - // TEST Mode: Collect valid cache entries - if (cacheMode == CacheMode.TEST) { - List cachedResponses = new ArrayList<>(); - - for (JSONObject experiment : experimentsWithData) { - - Optional experimentSummary = Summarizer.sendExperimentToOpenAI(geneId, experiment, CacheMode.TEST).join(); - - if (experimentSummary.isPresent()) { + if (!populateIfNotPresent) { + List cachedResponses = new ArrayList<>(); + + for (JSONObject experiment : experimentsWithData) { + + Optional experimentSummary = sendExperimentToOpenAI(geneId, experiment, populateIfNotPresent).join(); + + if (experimentSummary.isPresent()) { cachedResponses.add(experimentSummary.get()); - } else { + } + else { return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early - } - } - - // All experiment-level caches are valid, now check final summary cache - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, CacheMode.TEST); - return finalSummary - .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); - } + } + } + // All experiment-level caches are valid, now check final summary cache + Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); + + return finalSummary.map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", + summary)).orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); + } // Send AI requests in parallel List>> aiRequests = experimentsWithData.stream() - // TO DO - potentially some optimisation? - // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE))) - .map(exp -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE)) - .collect(Collectors.toList()); + // TO DO - potentially some optimisation? 
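One possible shape for the optimisation pondered in the TO DO above (the commented-out supplyAsync call that follows is a variant of the same idea): push each per-experiment task, including its cache check, onto a bounded executor and gather the results with allOf() rather than joining futures one by one. Names and the executor size are illustrative assumptions, not part of the patch.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

public class FanOutSketch {

  static <T> List<T> gather(List<CompletableFuture<T>> futures) {
    // allOf completes when every request has finished, so the join() calls below return immediately
    CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
    return futures.stream().map(CompletableFuture::join).collect(Collectors.toList());
  }

  public static void main(String[] args) {
    ExecutorService pool = Executors.newFixedThreadPool(4); // bounded fan-out
    List<CompletableFuture<String>> futures = List.of("exp1", "exp2", "exp3").stream()
        .map(id -> CompletableFuture.supplyAsync(() -> "summary of " + id, pool))
        .collect(Collectors.toList());
    System.out.println(gather(futures));
    pool.shutdown();
  }
}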
+ // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, + // CacheMode.POPULATE))) + .map(exp -> sendExperimentToOpenAI(geneId, exp, populateIfNotPresent)).collect(Collectors.toList()); // Wait for all requests to complete with `join` - List responses = aiRequests.stream() - .map(CompletableFuture::join) // Get Optional - .filter(Optional::isPresent) // Keep only non-empty results - .map(Optional::get) // Extract JSONObject - .collect(Collectors.toList()); + List responses = aiRequests.stream().map(CompletableFuture::join) // Get + // Optional + .filter(Optional::isPresent) // Keep only non-empty results + .map(Optional::get) // Extract JSONObject + .collect(Collectors.toList()); // Debug output // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); // System.exit(0); - - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, CacheMode.POPULATE); - return finalSummary - .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); - } catch (WdkModelException e) { + Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); + return finalSummary.map( + summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)).orElseGet( + () -> new JSONObject().put("cacheStatus", "miss")); + } + catch (WdkModelException e) { // Handle errors gracefully System.err.println("Error fetching expression data: " + e.getMessage()); @@ -186,125 +186,127 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod } } - - private static CompletableFuture> sendExperimentToOpenAI(String geneId, JSONObject experiment, CacheMode cacheMode) { + public static String getExperimentMessage(JSONObject experiment) { // Possible TO DO: AI EDIT DESCRIPTION // Before sending the experiment+data to the AI, ask the AI to edit the `description` field // as follows: (This should be cached by dataset_id only and would be called once per organism // and would reduce tokens and "cognitive load" a little bit for the next step.) // - // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // "Edit the following text to so that it **only** describes the experimental design of the + // transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, + // especially not any genes or groups of genes and their expression behaviour." // - // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the + // experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt + // below. - // We don't need to send dataset_id to the AI but it's useful to have it // in the response for phase two JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String datasetId = experimentForAI.has("dataset_id") ? 
experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - - String message = String.format("The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + - "```json\n%s\n```\n\n", experimentForAI.toString()) + - "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + - "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + - "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; - // System.out.println(message); /// DEBUG + return + "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + String.format("```json\n%s\n```\n\n", JsonUtil.serialize(experimentForAI)) + + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. 
Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; + } + + private CompletableFuture> sendExperimentToOpenAI(String geneId, + JSONObject experiment, boolean populateIfNotPresent) { String cacheKey = geneId + ':' + datasetId; - if (cache.isCacheValid(cacheKey, message)) { + if (_cache.isCacheValid(cacheKey, message)) { try { - JSONObject cachedResponse = cache.readCachedData(cacheKey); - return CompletableFuture.completedFuture(Optional.of(cachedResponse)); - } catch (Exception e) { - System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); - - if (cacheMode == CacheMode.TEST) { - return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss - } - // Else, log and fall through to AI generation + JSONObject cachedResponse = cache.readCachedData(cacheKey); + return CompletableFuture.completedFuture(Optional.of(cachedResponse)); + } + catch (Exception e) { + System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); + + if (!populateIfNotPresent) { + return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss + } + // Else, log and fall through to AI generation } - } else if (cacheMode == CacheMode.TEST) { + } + else if (!populateIfNotPresent) { return CompletableFuture.completedFuture(Optional.empty()); } - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("experiment-summary") - .schema(experimentResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( + OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( + ResponseFormatJsonSchema.builder().jsonSchema( + JsonSchema.builder().name("experiment-summary").schema( + experimentResponseSchema).build()).build()).addSystemMessage( + 
systemMessage).addUserMessage(message).build(); // add dataset_id back to the response - return openAIClient.chat().completions().create(request) - .thenApply(completion -> { - // response is a JSON string - String jsonString = completion.choices().get(0).message().content().get(); - try { - JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - - // Cache the response - try { - cache.populateCache(cacheKey, message, jsonObject); - } catch (Exception e) { - System.err.println("Warning: Failed to cache response for gene " + geneId + - " and dataset " + datasetId + ": " + e.getMessage()); - } - - return Optional.of(jsonObject); - } catch (JSONException e) { - System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); - return Optional.of(errorResponse); - } - }); + return _openAIClient.chat().completions().create(request).thenApply(completion -> { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + + // Cache the response + try { + cache.populateCache(cacheKey, message, jsonObject); + } + catch (Exception e) { + System.err.println("Warning: Failed to cache response for gene " + geneId + " and dataset " + + datasetId + ": " + e.getMessage()); + } + + return Optional.of(jsonObject); + } + catch (JSONException e) { + System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", + datasetId); + return Optional.of(errorResponse); + } + }); } + private Optional sendExperimentSummariesToOpenAI(String geneId, + List experiments, boolean populateIfNotPresent) { + + String message = String.format( + "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n", + new JSONArray(experiments).toString()) + + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - private static Optional sendExperimentSummariesToOpenAI(String geneId, List experiments, CacheMode cacheMode) { - - String message = String.format("Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. 
Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("expression-summary") - .schema(finalResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( + OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( + ResponseFormatJsonSchema.builder().jsonSchema( + JsonSchema.builder().name("expression-summary").schema( + finalResponseSchema).build()).build()).addSystemMessage(systemMessage).addUserMessage( + message).build(); // System.out.println(message); - ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response + ChatCompletion completion = _openAIClient.chat().completions().create(request).join(); // join() waits for + // the async + // response String jsonString = completion.choices().get(0).message().content().get(); JSONObject rawResponseObject = new JSONObject(jsonString); - // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by + // AI JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - + return Optional.of(finalResponseObject); } - - public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { + public static JSONObject consolidateSummary(JSONObject summaryResponse, + List individualResults) { // Gather all dataset IDs from individualResults and map them to summaries Map datasetSummaries = new HashMap<>(); for (JSONObject result : individualResults) { @@ -322,19 +324,21 @@ public static JSONObject consolidateSummary(JSONObject summaryResponse, List Date: Fri, 21 Feb 2025 10:04:03 -0500 Subject: [PATCH 11/31] Checkpoint commit; finished up AiExpressionCache and just need to trim down Summarizer --- .../ai/SingleGeneAiExpressionReporter.java | 20 +- .../ai/expression/AiExpressionCache.java | 287 ++++++++++++++---- .../ai/expression/GeneRecordProcessor.java | 26 +- .../report/ai/expression/Summarizer.java | 19 +- 4 files changed, 262 insertions(+), 90 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index ee0eb86eb..8686e5ad1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -11,10 +11,9 @@ import org.apidb.apicommon.model.TranscriptUtil; import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; -import org.apidb.apicommon.model.report.ai.expression.Summarizer; import 
org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; import org.gusdb.wdk.model.record.RecordClass; @@ -86,8 +85,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent - ? getSummary(summaryInputs, summarizer, cache) - : readSummary(summaryInputs, cache); + ? cache.populateSummary(summaryInputs, summarizer::describeExperiment, summarizer::summarizeExperiments) + : cache.readSummary(summaryInputs); // join entries with commas if (firstRecord) firstRecord = false; else writer.write(","); @@ -98,17 +97,4 @@ protected void write(OutputStream out) throws IOException, WdkModelException { } } } - - private JSONObject getSummary(GeneSummaryInputs summaryInputs, Summarizer summarizer, AiExpressionCache cache) { - try { - - } - } - - private JSONObject readSummary(GeneSummaryInputs summaryInputs, AiExpressionCache cache) { - try { - - } - } - } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 1b48ea698..a34f07103 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -4,28 +4,61 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; -import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.function.Predicate; +import org.apache.log4j.Logger; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; -import org.gusdb.fgputil.cache.disk.OnDiskCache.Overwrite; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; +import org.gusdb.fgputil.functional.FunctionalInterfaces.PredicateWithException; +import org.gusdb.fgputil.functional.FunctionalInterfaces.SupplierWithException; import org.gusdb.wdk.model.WdkModel; import org.json.JSONException; import org.json.JSONObject; public class AiExpressionCache { - // constants to determine cache location + private static Logger LOG = Logger.getLogger(AiExpressionCache.class); + + // cache location private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; + // catch characteristics private static final long DEFAULT_TIMEOUT_MILLIS = 5000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; + // cache filenames + private static final String CACHED_DATA_FILE = "cached_data.txt"; + private static final String CACHE_DIGEST_FILE = "digest.txt"; + + // returned JSON props and values + private static final String CACHE_STATUS = "cacheStatus"; // hit or 
miss + private static final String CACHE_HIT = "hit"; + private static final String HIT_RESULT = "expressionSummary"; // if hit, will have result + private static final String CACHE_MISS = "miss"; + private static final String MISS_REASON = "reason"; // if miss, will have reason + + // status messages + private static class LookupException extends Exception { + public static final LookupException EXPIRED_ENTRY = new LookupException("Expired entry"); + public static final LookupException CORRUPTED_ENTRY = new LookupException("Corrupted entry"); + public static final LookupException MISSING_ENTRY = new LookupException("Missing entry"); + private LookupException(String msg) { super(msg); } + public JSONObject toJson() { + return new JSONObject() + .put(CACHE_STATUS, CACHE_MISS) + .put(MISS_REASON, getMessage()); + } + } + // singleton pattern private static AiExpressionCache _instance; @@ -40,9 +73,11 @@ else if (_instance._wdkModel != wdkModel) { return _instance; } + // private fields private final WdkModel _wdkModel; private final OnDiskCache _cache; + // constructor public AiExpressionCache(WdkModel wdkModel) throws IOException { _wdkModel = wdkModel; @@ -52,83 +87,221 @@ public AiExpressionCache(WdkModel wdkModel) throws IOException { .orElse(Paths.get(_wdkModel.getModelConfig().getWdkTempDir().toString(), DEFAULT_TMP_CACHE_SUBDIR)); _cache = new OnDiskCache(cacheParentDir, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + } + /** + * Tries to read a gene summary from the cache without populating if absent. + * + * @param summaryInputs inputs for cache lookup + * @return response JSON (indicating cache hit or not with data or miss reason respectively) + */ + public JSONObject readSummary(GeneSummaryInputs summaryInputs) { + try { + return _cache.visitContent(summaryInputs.getGeneId(), + geneDir -> getValidSummary(geneDir, summaryInputs)); + } + catch (LookupException e) { + return e.toJson(); + } + catch (EntryNotCreatedException e) { + return LookupException.MISSING_ENTRY.toJson(); + } + catch (Exception e) { + // any other exception is a 500 + throw e instanceof RuntimeException ? (RuntimeException)e : new RuntimeException(e); + } } - public void blah() { - _cache.populateAndProcessContent(geneId, populator, visitor, overwritePredicate) + /** + * Confirms experiment descriptors are present and up to date with the inputs; if so, + * confirms summary is up to date with the inputs; if so, returns it. If anything is + * missing or out of date, returns cache-miss JSON. 
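For orientation while reading the constants and lookup helpers above, a rough sketch of how a caller might consume the hit/miss envelope that readSummary() returns. The inner summary object here is just a placeholder (its real shape is defined by the response schemas in Summarizer), and the class name is invented; populateSummary() is documented below to always return a hit because it regenerates stale or missing entries.

import org.json.JSONObject;

// Illustrative client-side handling of the envelope described above. A hit carries the
// summary under "expressionSummary"; a miss carries a human-readable "reason"
// ("Missing entry", "Expired entry" or "Corrupted entry").
public class EnvelopeExample {
  public static void main(String[] args) {
    JSONObject hit = new JSONObject()
        .put("cacheStatus", "hit")
        .put("expressionSummary", new JSONObject().put("placeholder", "summary fields go here"));
    JSONObject miss = new JSONObject()
        .put("cacheStatus", "miss")
        .put("reason", "Expired entry");

    for (JSONObject envelope : new JSONObject[] { hit, miss }) {
      if ("hit".equals(envelope.getString("cacheStatus"))) {
        System.out.println("summary: " + envelope.getJSONObject("expressionSummary"));
      }
      else {
        System.out.println("no cached summary: " + envelope.getString("reason"));
      }
    }
  }
}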
+ * + * @param geneDir directory for the summary entry + * @param summaryInputs inputs + * @return response JSON (indicating cache hit or not with data or miss reason respectively) + * @throws Exception lookup or other exception if unable to find or validate cached data + */ + private JSONObject getValidSummary(Path geneDir, GeneSummaryInputs summaryInputs) throws Exception { + + // check for existence of valid cache entries for each experiment + // if any are missing or expired, exception will be thrown causing a cache miss + for (ExperimentInputs datasetInput : summaryInputs.getExperimentsWithData()) { + _cache.visitContent(datasetInput.getCacheKey(), experimentDir -> { + return getValidStoredData(experimentDir, datasetInput.getDigest()); + }); + } + + // once all experiment values are confirmed, check for valid summary entry + JSONObject summary = getValidStoredData(geneDir, summaryInputs.getDigest()); + return new JSONObject() + .put(CACHE_STATUS, CACHE_HIT) + .put(HIT_RESULT, summary); } - // Check if cached data is valid - public boolean isCacheValid(GeneSummaryInputs summaryInputs) { + /** + * Checks an entry for a valid digest and readable data file; if valid and present, returns + * parsed JSON data + * + * @param entryDir directory of the entry (could be summary or experiment) + * @param computedDigest expected digest; mismatch indicates cache entry is expired + * @return JSON data for this entry + * @throws IOException if unable to read files from disk + * @throws LookupException if entry is expired or corrupted + */ + private static JSONObject getValidStoredData(Path entryDir, String computedDigest) throws IOException, LookupException { + + // 1. check digest against existing value + if (!digestsMatch(entryDir, computedDigest)) { + throw LookupException.EXPIRED_ENTRY; + } + + // 2. check for presence of cached data, then read + return readCachedData(entryDir) + .orElseThrow(() -> LookupException.CORRUPTED_ENTRY); + } + + /** + * Checks if contents of digest file in the passed entry dir match a passed + * computed digest; returns false if file is missing or digests don't match, else true\ + * + * @param entryDir entry directory + * @param computedDigest digest to which existing digest should be compared + * @return whether digests match + * @throws IOException if unable to read file + */ + private static boolean digestsMatch(Path entryDir, String computedDigest) throws IOException { + Path digestFile = entryDir.resolve(CACHE_DIGEST_FILE); + return Files.exists(digestFile) && + Files.readString(digestFile).equals(computedDigest); + } + + /** + * Read cached data file from entry, returns empty optional if data file + * does not exist or is unable to read or parsed into JSON. + * + * @param entryDir entry directory + * @return optional entry data + */ + private static Optional readCachedData(Path entryDir) { try { - FunctionWithException visitor = entryDir -> { - Path digestFile = entryDir.resolve("digest.txt"); + Path file = entryDir.resolve(CACHED_DATA_FILE); + return Files.exists(file) + ? Optional.of(new JSONObject(Files.readString(file))) + : Optional.empty(); + } + catch (IOException | JSONException e) { + LOG.error("Unable to read or parse cached data", e); + return Optional.empty(); + } + } - if (!Files.exists(digestFile)) { - System.out.println("No digest file found."); - return false; - } + /** + * Returns a cached gene expression summary, generating and storing a new value if none + * exists or if the existing value is out of date with the passed digests. 
+ * + * @param summaryInputs gene summary inputs + * @param experimentDescriber function to describe an experiment + * @param experimentSummarizer function to summarize experiments into an expression summary + * @return expression summary (will always be a cache hit) + */ + public JSONObject populateSummary(GeneSummaryInputs summaryInputs, + FunctionWithException> experimentDescriber, + FunctionWithException, JSONObject> experimentSummarizer) { + try { + return _cache.populateAndProcessContent(summaryInputs.getGeneId(), - // Read stored digest and compare - String cachedDigest = Files.readString(digestFile); + // populator + entryDir -> { + // first populate each dataset entry as needed and collect experiment descriptors + List experiments = populateExperiments(summaryInputs.getExperimentsWithData(), experimentDescriber); - if (cachedDigest.equals(summaryInputs.getExperimentsDigest())) { - System.out.println("Cache digest matches input."); - return true; - } - else { - System.out.println("Cache digest mismatch! Cache is out of date."); - return false; - } - }; + // summarize experiments and store + getPopulator(summaryInputs.getDigest(), () -> experimentSummarizer.apply(experiments)).accept(entryDir); + }, - return _cache.visitContent(summaryInputs.getGeneId(), visitor); + // visitor + entryDir -> getValidSummary(entryDir, summaryInputs), - } - catch (EntryNotCreatedException e) { - System.out.println("Cache entry does not exist yet."); - return false; + // repopulation predicate + exceptionToTrue(entryDir -> + // try to look up summary json; if cache miss, then repopulate + getValidSummary(entryDir, summaryInputs).getString(CACHE_STATUS).equals(CACHE_MISS))); } catch (Exception e) { - throw new RuntimeException("Error validating cache entry", e); + // any other exception is a 500 + throw e instanceof RuntimeException ? (RuntimeException)e : new RuntimeException(e); } } - // Populate cache with computed data (Method 1: Takes computedData directly) - public void populateCache(GeneSummaryInputs summaryInputs, JSONObject computedData) throws Exception { - ConsumerWithException populator = entryDir -> { - Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString()); - Files.writeString(entryDir.resolve("digest.txt"), summaryInputs.getExperimentsDigest()); - }; + /** + * Returns a set of cached experiment descriptions, generating and storing new values for any + * experiments not present or that are out of date (mismatched digests). In this way, any new + * experiments do not result in regeneration of descriptors for previously released experiments. 
+ * + * @param experimentData experiment inputs + * @param experimentDescriber function to describe an experiment + * @return list of cached experiment descriptions + * @throws Exception if unable to generate descriptions or store + */ + private List populateExperiments(List experimentData, + FunctionWithException> experimentDescriber) throws Exception { + List experiments = new ArrayList<>(); + // start with serial generation; move back to parallel later + for (ExperimentInputs input : experimentData) { + experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), - // Populate with overwrite policy (assumes caller ensures it's necessary) - _cache.populateAndProcessContent(summaryInputs.getGeneId(), populator, path -> null, Overwrite.YES); - } + // populator + getPopulator(input.getDigest(), () -> experimentDescriber.apply(input.getExperimentData()).get()), -// // Populate cache with a function that computes the result (Method 2: Uses a function) -// public void populateCache(String cacheKey, String inputData, Function computation) throws Exception { -// populateCache(cacheKey, inputData, computation.apply(inputData)); -// } + // visitor + experimentDir -> getValidStoredData(experimentDir, input.getDigest()), + // repopulation predicate + exceptionToTrue(experimentDir -> { + getValidStoredData(experimentDir, input.getDigest()); + return false; // do not repopulate if able to look up valid value + }) + )); + } + return experiments; + } - // Read cached data (throws IOException if missing) - public JSONObject readCachedData(String cacheKey) throws Exception { - FunctionWithException visitor = entryDir -> { - Path file = entryDir.resolve("cached_data.txt"); - if (!Files.exists(file)) { - throw new IOException("Cache entry missing: " + file); - } - String fileContents = Files.readString(file); + /** + * Takes a predicate that throws an exception and returns a predicate that + * does not, converting any thrown exception to true + * + * @param predicate predicate that throws an exception + * @return the value returned by the passed predicate, or true if an exception is thrown + */ + private Predicate exceptionToTrue(PredicateWithException predicate) { + return path -> { try { - JSONObject jsonObject = new JSONObject(fileContents); - return jsonObject; - } catch (JSONException e) { - throw e; + return predicate.test(path); + } + catch (Exception e) { + return true; } }; - - return _cache.visitContent(cacheKey, visitor); + } + + /** + * Returns a function that populates a cache entry with the passed + * digest and with data supplied by the passed supplier. 
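The behaviour described in the sentence above boils down to a per-entry digest gate. The sketch below is a deliberately simplified stand-in, with invented names and none of the OnDiskCache locking or timeout handling: untouched experiment entries keep passing the digest check, while a newly added experiment and the gene-level summary (whose digest also changes) are the only things regenerated.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Supplier;
import org.json.JSONObject;

public class DigestGateSketch {

  // the expensive generator runs only when digest.txt no longer matches the digest
  // computed from the current inputs, or when the entry has never been written
  static JSONObject getOrRegenerate(Path entryDir, String currentDigest,
      Supplier<JSONObject> generator) throws IOException {
    Path digestFile = entryDir.resolve("digest.txt");
    Path dataFile = entryDir.resolve("cached_data.txt");

    // reuse the cached value if both files exist and the stored digest is still current
    if (Files.exists(digestFile) && Files.exists(dataFile)
        && Files.readString(digestFile).equals(currentDigest)) {
      return new JSONObject(Files.readString(dataFile));
    }

    // otherwise regenerate, then store the new data and digest for next time
    JSONObject fresh = generator.get();
    Files.writeString(dataFile, fresh.toString());
    Files.writeString(digestFile, currentDigest);
    return fresh;
  }
}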
+ * + * @param digest digest to store + * @param dataSupplier supplier of data to store + * @return population function + */ + private ConsumerWithException getPopulator(String digest, SupplierWithException dataSupplier) { + return entryDir -> { + + // write digest to digest file + Files.writeString(entryDir.resolve(CACHE_DIGEST_FILE), digest); + + // write data + Files.writeString(entryDir.resolve(CACHED_DATA_FILE), dataSupplier.get().toString()); + }; } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 807cec0f3..9a2fc211c 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,9 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -37,6 +35,8 @@ public interface ExperimentInputs { String getCacheKey(); + String getDatasetId(); + String getDigest(); JSONObject getExperimentData(); @@ -46,11 +46,11 @@ public interface GeneSummaryInputs { String getGeneId(); // is the cache key - Map getExperimentsWithData(); + List getExperimentsWithData(); - default String getExperimentsDigest() { + default String getDigest() { // TODO Does it make more sense to md5 the concatenation of the experiment hashes? - return EncryptionUtil.md5(getExperimentsWithData().values().stream() + return EncryptionUtil.md5(getExperimentsWithData().stream() .map(ExperimentInputs::getExperimentData) .map(JsonUtil::serialize) .collect(Collectors.joining())); @@ -65,7 +65,7 @@ public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record String geneId = getGeneId(record); - Map experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); return new GeneSummaryInputs() { @Override @@ -74,16 +74,16 @@ public String getGeneId() { } @Override - public Map getExperimentsWithData() { + public List getExperimentsWithData() { return experimentsWithData; } }; } - private static Map processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { + private static List processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: - Map experiments = new LinkedHashMap<>(); + List experiments = new ArrayList<>(); String geneId = getGeneId(record); TableValue expressionGraphs = record.getTableValue(EXPRESSION_GRAPH_TABLE); @@ -104,7 +104,13 @@ private static Map processExpressionData(RecordInstanc experimentInfo.put("data", filteredData); - experiments.put(datasetId, new ExperimentInputs() { + experiments.add(new ExperimentInputs() { + + @Override + public String getDatasetId() { + return datasetId; + } + @Override public String getCacheKey() { return geneId + ':' + datasetId; diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 865d78457..c348a6d22 100644 --- 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -151,8 +151,12 @@ public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean p // All experiment-level caches are valid, now check final summary cache Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); - return finalSummary.map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", - summary)).orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); + return finalSummary + .map(summary -> new JSONObject() + .put("cacheStatus", "hit") + .put("expressionSummary", summary)) + .orElseGet(() -> new JSONObject() + .put("cacheStatus", "miss")); } // Send AI requests in parallel @@ -174,9 +178,12 @@ public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean p // System.exit(0); Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); - return finalSummary.map( - summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)).orElseGet( - () -> new JSONObject().put("cacheStatus", "miss")); + return finalSummary + .map(summary -> new JSONObject() + .put("cacheStatus", "hit") + .put("expressionSummary", summary)) + .orElseGet(() -> new JSONObject() + .put("cacheStatus", "miss")); } catch (WdkModelException e) { @@ -275,7 +282,7 @@ else if (!populateIfNotPresent) { } private Optional sendExperimentSummariesToOpenAI(String geneId, - List experiments, boolean populateIfNotPresent) { + List experiments) { String message = String.format( "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + From be6a61a21f376510315a470d446766d8edacb7bf Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Fri, 21 Feb 2025 15:44:36 -0500 Subject: [PATCH 12/31] Clean cache logic out of summarizer --- .../ai/SingleGeneAiExpressionReporter.java | 2 +- .../ai/expression/AiExpressionCache.java | 6 +- .../report/ai/expression/Summarizer.java | 203 +++++------------- 3 files changed, 55 insertions(+), 156 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 8686e5ad1..e3e5808ce 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -91,7 +91,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // join entries with commas if (firstRecord) firstRecord = false; else writer.write(","); - // write JSON object + // write JSON object property, keyed by gene ID writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index a34f07103..da73895d3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -206,7 +206,7 @@ private static Optional readCachedData(Path entryDir) { * @return expression summary (will 
always be a cache hit) */ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, - FunctionWithException> experimentDescriber, + FunctionWithException> experimentDescriber, FunctionWithException, JSONObject> experimentSummarizer) { try { return _cache.populateAndProcessContent(summaryInputs.getGeneId(), @@ -245,14 +245,14 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, * @throws Exception if unable to generate descriptions or store */ private List populateExperiments(List experimentData, - FunctionWithException> experimentDescriber) throws Exception { + FunctionWithException> experimentDescriber) throws Exception { List experiments = new ArrayList<>(); // start with serial generation; move back to parallel later for (ExperimentInputs input : experimentData) { experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), // populator - getPopulator(input.getDigest(), () -> experimentDescriber.apply(input.getExperimentData()).get()), + getPopulator(input.getDigest(), () -> experimentDescriber.apply(input).get()), // visitor experimentDir -> getValidStoredData(experimentDir, input.getDigest()), diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index c348a6d22..cf20e7f20 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,23 +1,15 @@ package org.apidb.apicommon.model.report.ai.expression; -import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; -import org.apache.log4j.Logger; -import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs; import org.gusdb.fgputil.json.JsonUtil; import org.gusdb.wdk.model.WdkModel; -import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; -import org.gusdb.wdk.model.record.RecordInstance; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -33,14 +25,13 @@ public class Summarizer { - private static final Logger LOG = Logger.getLogger(Summarizer.class); - // provide exact model number for semi-reproducibility - public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + // TODO: should this be incorporated into the digests, so if we change the chat model, all generated summaries become expired? + private static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; private static final int MAX_RESPONSE_TOKENS = 10000; - private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; + private static final String SYSTEM_MESSAGE = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; // Prepare JSON schemas for structured responses // NOTE: this code is horrible to look at/read. 
It would be better to just define the schemas as JSON strings @@ -122,77 +113,6 @@ public Summarizer(WdkModel wdkModel) { .build(); } - public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean populateIfNotPresent) - throws WdkUserException { - - try { - - // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = summaryInputs.getExperimentsWithData(); - String geneId = summaryInputs.getGeneId(); - System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - - // TEST Mode: Collect valid cache entries - if (!populateIfNotPresent) { - List cachedResponses = new ArrayList<>(); - - for (JSONObject experiment : experimentsWithData) { - - Optional experimentSummary = sendExperimentToOpenAI(geneId, experiment, populateIfNotPresent).join(); - - if (experimentSummary.isPresent()) { - cachedResponses.add(experimentSummary.get()); - } - else { - return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early - } - } - - // All experiment-level caches are valid, now check final summary cache - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); - - return finalSummary - .map(summary -> new JSONObject() - .put("cacheStatus", "hit") - .put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject() - .put("cacheStatus", "miss")); - } - - // Send AI requests in parallel - List>> aiRequests = experimentsWithData.stream() - // TO DO - potentially some optimisation? - // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, - // CacheMode.POPULATE))) - .map(exp -> sendExperimentToOpenAI(geneId, exp, populateIfNotPresent)).collect(Collectors.toList()); - // Wait for all requests to complete with `join` - List responses = aiRequests.stream().map(CompletableFuture::join) // Get - // Optional - .filter(Optional::isPresent) // Keep only non-empty results - .map(Optional::get) // Extract JSONObject - .collect(Collectors.toList()); - - // Debug output - // System.out.println("Individual responses:"); - // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); - - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); - return finalSummary - .map(summary -> new JSONObject() - .put("cacheStatus", "hit") - .put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject() - .put("cacheStatus", "miss")); - } - catch (WdkModelException e) { - - // Handle errors gracefully - System.err.println("Error fetching expression data: " + e.getMessage()); - throw new WdkUserException(e); - } - } - public static String getExperimentMessage(JSONObject experiment) { // Possible TO DO: AI EDIT DESCRIPTION @@ -221,35 +141,20 @@ public static String getExperimentMessage(JSONObject experiment) { "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. 
For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; } - private CompletableFuture> sendExperimentToOpenAI(String geneId, - JSONObject experiment, boolean populateIfNotPresent) { - - String cacheKey = geneId + ':' + datasetId; - - if (_cache.isCacheValid(cacheKey, message)) { - try { - JSONObject cachedResponse = cache.readCachedData(cacheKey); - return CompletableFuture.completedFuture(Optional.of(cachedResponse)); - } - catch (Exception e) { - System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); - - if (!populateIfNotPresent) { - return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss - } - // Else, log and fall through to AI generation - } - } - else if (!populateIfNotPresent) { - return CompletableFuture.completedFuture(Optional.empty()); - } - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( - OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( - ResponseFormatJsonSchema.builder().jsonSchema( - JsonSchema.builder().name("experiment-summary").schema( - experimentResponseSchema).build()).build()).addSystemMessage( - systemMessage).addUserMessage(message).build(); + public CompletableFuture describeExperiment(ExperimentInputs experimentInputs) { + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(OPENAI_CHAT_MODEL) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(SYSTEM_MESSAGE) + .addUserMessage(getExperimentMessage(experimentInputs.getExperimentData())) + .build(); // add dataset_id back to the response return _openAIClient.chat().completions().create(request).thenApply(completion -> { @@ -257,62 +162,56 @@ else if (!populateIfNotPresent) { String jsonString = completion.choices().get(0).message().content().get(); try { JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - - // Cache the response - try { - cache.populateCache(cacheKey, message, jsonObject); - } - catch (Exception e) { - System.err.println("Warning: Failed to cache response for gene " + geneId + " and dataset " + - datasetId + ": " + e.getMessage()); - } - - return Optional.of(jsonObject); + jsonObject.put("dataset_id", experimentInputs.getDatasetId()); + return jsonObject; } catch (JSONException e) { - System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + - ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", - datasetId); - return Optional.of(errorResponse); + throw new RuntimeException( + "Error parsing JSON response for dataset " + experimentInputs.getDatasetId() + + ". 
Raw response string:\n" + jsonString + "\n", e); } }); } - private Optional sendExperimentSummariesToOpenAI(String geneId, - List experiments) { + public JSONObject summarizeExperiments(List experiments) { - String message = String.format( + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n", - new JSONArray(experiments).toString()) + + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( - OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( - ResponseFormatJsonSchema.builder().jsonSchema( - JsonSchema.builder().name("expression-summary").schema( - finalResponseSchema).build()).build()).addSystemMessage(systemMessage).addUserMessage( - message).build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(OPENAI_CHAT_MODEL) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("expression-summary") + .schema(finalResponseSchema) + .build()) + .build()) + .addSystemMessage(SYSTEM_MESSAGE) + .addUserMessage(message) + .build(); - // System.out.println(message); + ChatCompletion completion = _openAIClient.chat().completions().create(request) + .join(); // join() waits for the async response - ChatCompletion completion = _openAIClient.chat().completions().create(request).join(); // join() waits for - // the async - // response String jsonString = completion.choices().get(0).message().content().get(); - JSONObject rawResponseObject = new JSONObject(jsonString); + try { + JSONObject rawResponseObject = new JSONObject(jsonString); - // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by - // AI - JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return Optional.of(finalResponseObject); + return finalResponseObject; + } + catch (JSONException e) { + throw new RuntimeException("Error parsing JSON response " + + "for gene summary. 
Raw response string:\n" + jsonString + "\n", e); + } } - public static JSONObject consolidateSummary(JSONObject summaryResponse, + private static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { // Gather all dataset IDs from individualResults and map them to summaries Map datasetSummaries = new HashMap<>(); From 32a73518ba77ebf6f3f93f9e1bfedaabb9c0d733 Mon Sep 17 00:00:00 2001 From: Bob Date: Sat, 22 Feb 2025 18:23:49 +0000 Subject: [PATCH 13/31] make cache validation digest symmetrical for both levels of AI query --- .../ai/SingleGeneAiExpressionReporter.java | 3 ++- .../ai/expression/GeneRecordProcessor.java | 27 ++++++++++++------- .../report/ai/expression/Summarizer.java | 11 +++++--- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index e3e5808ce..1b80ca346 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -81,7 +81,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { for (RecordInstance record : recordStream) { // create summary inputs - GeneSummaryInputs summaryInputs = GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage); + GeneSummaryInputs summaryInputs = + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 9a2fc211c..ceee1482f 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -48,24 +48,18 @@ public interface GeneSummaryInputs { List getExperimentsWithData(); - default String getDigest() { - // TODO Does it make more sense to md5 the concatenation of the experiment hashes? 
- return EncryptionUtil.md5(getExperimentsWithData().stream() - .map(ExperimentInputs::getExperimentData) - .map(JsonUtil::serialize) - .collect(Collectors.joining())); - } + String getDigest(); } private static String getGeneId(RecordInstance record) { return record.getPrimaryKey().getValues().get("gene_source_id"); } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function experimentDigester) throws WdkModelException { + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { String geneId = getGeneId(record); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, getExperimentPrompt, 0); return new GeneSummaryInputs() { @Override @@ -77,6 +71,21 @@ public String getGeneId() { public List getExperimentsWithData() { return experimentsWithData; } + + @Override + public String getDigest() { + // Instead of building the final summary prompt using the AI-generated **summary outputs** + // (which happens during real processing), we construct it using JSON-encoded MD5 + // **digests** of the per-experiment **inputs**. + // + // This avoids fetching per-experiment results from the cache while remaining + // functionally identical for cache validation purposes. + List digests = experimentsWithData.stream() + .map(exp -> new JSONObject().put("digest", exp.getDigest())) + .collect(Collectors.toList()); + return EncryptionUtil.md5(getFinalSummaryPrompt.apply(digests)); + } + }; } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index cf20e7f20..8a8ae7aa1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -173,13 +173,16 @@ public CompletableFuture describeExperiment(ExperimentInputs experim }); } - public JSONObject summarizeExperiments(List experiments) { + public static String getFinalSummaryMessage(List experiments) { - String message = - "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + return "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. 
Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; + } + + public JSONObject summarizeExperiments(List experiments) { + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(OPENAI_CHAT_MODEL) .maxCompletionTokens(MAX_RESPONSE_TOKENS) @@ -190,7 +193,7 @@ public JSONObject summarizeExperiments(List experiments) { .build()) .build()) .addSystemMessage(SYSTEM_MESSAGE) - .addUserMessage(message) + .addUserMessage(getFinalSummaryMessage(experiments)) .build(); ChatCompletion completion = _openAIClient.chat().completions().create(request) From 44b65aab7f49474aa691125bb71c5e3287a5e214 Mon Sep 17 00:00:00 2001 From: Bob Date: Sat, 22 Feb 2025 18:49:45 +0000 Subject: [PATCH 14/31] improved code formatting of JSONSchema definitions --- .../report/ai/expression/Summarizer.java | 96 +++++++------------ 1 file changed, 34 insertions(+), 62 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 8a8ae7aa1..3207a0bd0 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -34,72 +34,44 @@ public class Summarizer { private static final String SYSTEM_MESSAGE = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; // Prepare JSON schemas for structured responses - // NOTE: this code is horrible to look at/read. It would be better to just define the schemas as JSON strings - // but this is only really nice when we have """ text block """ support, coming soon when we upgrade, perhaps? 
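// [Editor's aside, not part of this patch series] The removed note above wishes these schemas
// were defined as plain JSON strings rather than nested builder calls. A rough sketch of that
// approach, assuming Java text blocks are available and that JsonValue.from() accepts the
// nested Maps/Lists produced by org.json.JSONObject.toMap(); the helper name schemaFromJson
// and the field name experimentResponseSchemaFromText are made up for illustration:

private static JsonSchema.Schema schemaFromJson(String json) {
  var builder = JsonSchema.Schema.builder();
  // convert the parsed JSON into the property map the openai-java builder expects
  new JSONObject(json).toMap()
      .forEach((key, value) -> builder.putAdditionalProperty(key, JsonValue.from(value)));
  return builder.build();
}

private static final JsonSchema.Schema experimentResponseSchemaFromText = schemaFromJson("""
    {
      "type": "object",
      "properties": {
        "one_sentence_summary":  { "type": "string" },
        "biological_importance": { "type": "integer", "minimum": 0, "maximum": 5 },
        "confidence":            { "type": "integer", "minimum": 0, "maximum": 5 },
        "experiment_keywords":   { "type": "array", "items": { "type": "string" } },
        "notes":                 { "type": "string" }
      },
      "required": ["one_sentence_summary", "biological_importance", "confidence",
                   "experiment_keywords", "notes"]
    }
    """);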
- private static final JsonSchema.Schema experimentResponseSchema = - JsonSchema.Schema.builder() + private static final JsonSchema.Schema experimentResponseSchema = JsonSchema.Schema.builder() .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "one_sentence_summary", Map.of("type", "string"), - "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), - "notes", Map.of("type", "string") - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "one_sentence_summary", - "biological_importance", - "confidence", - "experiment_keywords", - "notes") - ) - ) + .putAdditionalProperty("properties", JsonValue.from(Map.of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ))) + .putAdditionalProperty("required", JsonValue.from(List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes" + ))) .build(); - private static final JsonSchema.Schema finalResponseSchema = - JsonSchema.Schema.builder() + private static final JsonSchema.Schema finalResponseSchema = JsonSchema.Schema.builder() .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "headline", Map.of("type", "string"), - "one_paragraph_summary", Map.of("type", "string"), - "sections", - Map.of("type", "array", - "minimum", 1, - "items", - Map.of( - "type", "object", - "required", List.of("headline", "one_sentence_summary", "dataset_ids"), - "properties", - Map.of( - "headline", Map.of("type", "string"), - "one_sentence_summary", Map.of("type", "string"), - "dataset_ids", Map.of("type", "array", - "items", Map.of("type", "string")) - ) - ) - ) - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "headline", - "one_paragraph_summary", - "dataset_ids" - ) - ) - ) + .putAdditionalProperty("properties", JsonValue.from(Map.of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", Map.of("type", "array", "minimum", 1, "items", Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", "items", Map.of("type", "string")) + ) + )) + ))) + .putAdditionalProperty("required", JsonValue.from(List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ))) .build(); private static final String OPENAI_API_KEY_PROP_NAME = "OPENAI_API_KEY"; From 4bcb86f670c4103af5ba8d41700e70ba5ae2085d Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 23 Feb 2025 21:24:18 -0500 Subject: [PATCH 15/31] Incorporate AI chat model string into result digests; they will become out-of-date if the chat model changes --- .../ai/SingleGeneAiExpressionReporter.java | 3 +- .../ai/expression/GeneRecordProcessor.java | 29 +++++++++---------- .../report/ai/expression/Summarizer.java | 3 +- 3 files changed, 17 insertions(+), 18 
deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 1b80ca346..07919351d 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -82,7 +82,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // create summary inputs GeneSummaryInputs summaryInputs = - GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.asString(), + Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index ceee1482f..c82dede69 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -7,7 +7,6 @@ import java.util.stream.Collectors; import org.gusdb.fgputil.EncryptionUtil; -import org.gusdb.fgputil.json.JsonUtil; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.record.RecordInstance; @@ -55,11 +54,11 @@ private static String getGeneId(RecordInstance record) { return record.getPrimaryKey().getValues().get("gene_source_id"); } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, String aiChatModel, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { String geneId = getGeneId(record); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record, getExperimentPrompt, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, aiChatModel, getExperimentPrompt, 0); return new GeneSummaryInputs() { @Override @@ -74,22 +73,22 @@ public List getExperimentsWithData() { @Override public String getDigest() { - // Instead of building the final summary prompt using the AI-generated **summary outputs** - // (which happens during real processing), we construct it using JSON-encoded MD5 - // **digests** of the per-experiment **inputs**. - // - // This avoids fetching per-experiment results from the cache while remaining - // functionally identical for cache validation purposes. - List digests = experimentsWithData.stream() - .map(exp -> new JSONObject().put("digest", exp.getDigest())) - .collect(Collectors.toList()); - return EncryptionUtil.md5(getFinalSummaryPrompt.apply(digests)); + // Instead of building the final summary prompt using the AI-generated **summary outputs** + // (which happens during real processing), we construct it using JSON-encoded MD5 + // **digests** of the per-experiment **inputs**. 
+ // + // This avoids fetching per-experiment results from the cache while remaining + // functionally identical for cache validation purposes. + List digests = experimentsWithData.stream() + .map(exp -> new JSONObject().put("digest", exp.getDigest())) + .collect(Collectors.toList()); + return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests)); } }; } - private static List processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { + private static List processExpressionData(RecordInstance record, String aiChatModel, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: List experiments = new ArrayList<>(); @@ -127,7 +126,7 @@ public String getCacheKey() { @Override public String getDigest() { - return EncryptionUtil.md5(getExperimentPrompt.apply(getExperimentData())); + return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData())); } @Override diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 3207a0bd0..d4ff58fe4 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -26,8 +26,7 @@ public class Summarizer { // provide exact model number for semi-reproducibility - // TODO: should this be incorporated into the digests, so if we change the chat model, all generated summaries become expired? - private static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; private static final int MAX_RESPONSE_TOKENS = 10000; From be790187b6945ccac69cdaad5b7df12b083ee64a Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 23 Feb 2025 22:51:19 -0500 Subject: [PATCH 16/31] Reparallelize experiment lookups --- .../ai/expression/AiExpressionCache.java | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index da73895d3..8c65b7f33 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -1,5 +1,8 @@ package org.apidb.apicommon.model.report.ai.expression; +import static java.util.concurrent.CompletableFuture.supplyAsync; +import static org.gusdb.fgputil.functional.Functions.wrapException; + import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -8,6 +11,8 @@ import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.function.Predicate; import org.apache.log4j.Logger; @@ -15,6 +20,7 @@ import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; +import org.gusdb.fgputil.functional.Either; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; 
import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.PredicateWithException; @@ -27,6 +33,9 @@ public class AiExpressionCache { private static Logger LOG = Logger.getLogger(AiExpressionCache.class); + // parallel processing + private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 5; + // cache location private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; @@ -246,10 +255,17 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, */ private List populateExperiments(List experimentData, FunctionWithException> experimentDescriber) throws Exception { - List experiments = new ArrayList<>(); - // start with serial generation; move back to parallel later - for (ExperimentInputs input : experimentData) { - experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), + + // use a thread for each experiment, up to a reasonable max + int threadPoolSize = Math.min(MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST, experimentData.size()); + + ExecutorService exec = Executors.newFixedThreadPool(threadPoolSize); + try { + // look up experiment results in parallel, wait for completion, and aggregate results + List> results = new ArrayList<>(); + for (ExperimentInputs input : experimentData) { + + results.add(supplyAsync(() -> wrapException(() -> _cache.populateAndProcessContent(input.getCacheKey(), // populator getPopulator(input.getDigest(), () -> experimentDescriber.apply(input).get()), @@ -259,12 +275,29 @@ private List populateExperiments(List experimentDa // repopulation predicate exceptionToTrue(experimentDir -> { - getValidStoredData(experimentDir, input.getDigest()); - return false; // do not repopulate if able to look up valid value - }) - )); + getValidStoredData(experimentDir, input.getDigest()); + return false; // do not repopulate if able to look up valid value + })) + + ), exec)); + } + + // wait for all threads, filling lists along the way + List descriptors = new ArrayList<>(); + List exceptions = new ArrayList<>(); + for (CompletableFuture result : results) { + result.handle(Either::new).get().ifLeft(descriptors::add).ifRight(exceptions::add); + } + + // if no exceptions occurred, return results; else throw first problem + if (exceptions.isEmpty()) { + return descriptors; + } + throw new RuntimeException(exceptions.get(0)); + } + finally { + exec.shutdown(); } - return experiments; } /** From 18f08707137ab97d6299c1af0eab17f8a070b2aa Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 10:52:28 -0500 Subject: [PATCH 17/31] increase timeout and use ChatModel.toString() to fix exception --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 2 +- .../apicommon/model/report/ai/expression/AiExpressionCache.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 07919351d..76a1d83e4 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -82,7 +82,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // create summary inputs GeneSummaryInputs summaryInputs = - 
GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.asString(), + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.toString(), Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 8c65b7f33..eae3a6dfc 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -41,7 +41,7 @@ public class AiExpressionCache { private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; // catch characteristics - private static final long DEFAULT_TIMEOUT_MILLIS = 5000; + private static final long DEFAULT_TIMEOUT_MILLIS = 5 * 60 * 1000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; // cache filenames From a5799699455d03e62b007e07c328f16b8cd96730 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 11:47:31 -0500 Subject: [PATCH 18/31] getGeneId() fix --- .../model/report/ai/expression/GeneRecordProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index c82dede69..6c5349953 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -51,7 +51,7 @@ public interface GeneSummaryInputs { } private static String getGeneId(RecordInstance record) { - return record.getPrimaryKey().getValues().get("gene_source_id"); + return record.getPrimaryKey().getValues().get("source_id"); } public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, String aiChatModel, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { From 6693be51476c9883ebdb4451c16bacc5a670745c Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 12:42:44 -0500 Subject: [PATCH 19/31] bugfix --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 76a1d83e4..e840a8850 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -97,6 +97,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); } + writer.write("}"); } } } From 4f52d3796e6fb0d4e4d3b09edf30d3a42a8b7898 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 25 Feb 2025 06:24:37 -0500 Subject: [PATCH 20/31] reworked summary prompt to avoid generalities and for clarity --- .../apicommon/model/report/ai/expression/Summarizer.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d4ff58fe4..cd0cf1dd1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -146,10 +146,13 @@ public CompletableFuture describeExperiment(ExperimentInputs experim public static String getFinalSummaryMessage(List experiments) { - return "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - + "Generate a one-paragraph summary (~100 words) describing the gene's expression. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\".\n\n" + + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + + "- A headline summarizing the section's key findings\n" + + "- A concise one-sentence summary of the experimental results\n\n" + + "These sections will be displayed to users. Wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; } public JSONObject summarizeExperiments(List experiments) { From 6a5a078158e57da6fb7349fde7e6ba42097f6f9d Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 25 Feb 2025 14:59:03 -0500 Subject: [PATCH 21/31] prompt for structured summary paragraph --- .../apicommon/model/report/ai/expression/Summarizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index cd0cf1dd1..f0ccf1865 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -148,11 +148,11 @@ public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Generate a one-paragraph summary (~100 words) describing the gene's expression. 
If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\".\n\n" + + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
  • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + "- A headline summarizing the section's key findings\n" + "- A concise one-sentence summary of the experimental results\n\n" + - "These sections will be displayed to users. Wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; + "These sections will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; } public JSONObject summarizeExperiments(List experiments) { From f35a43e29524ae04bfcb18a0cc030084519dc76b Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Fri, 28 Feb 2025 03:25:45 -0500 Subject: [PATCH 22/31] Remove openai version (now in base pom) --- Model/pom.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Model/pom.xml b/Model/pom.xml index 0d1bbe5dd..76e98617a 100644 --- a/Model/pom.xml +++ b/Model/pom.xml @@ -135,11 +135,9 @@ Jackfish - com.openai openai-java - 0.22.0 From 6dd538058b03ec7bf8f8b084baaeb5d1b9d32028 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:00:01 +0000 Subject: [PATCH 23/31] pretty print JSON sent to the model --- .../apicommon/model/report/ai/expression/Summarizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index f0ccf1865..ecef898ca 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -101,7 +101,7 @@ public static String getExperimentMessage(JSONObject experiment) { // We don't need to send dataset_id to the AI but it's useful to have it // in the response for phase two - JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + JSONObject experimentForAI = new JSONObject(experiment.toString(2)); // clone experimentForAI.remove("dataset_id"); return @@ -147,7 +147,7 @@ public CompletableFuture describeExperiment(ExperimentInputs experim public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + - String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
    • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + "- A headline summarizing the section's key findings\n" + From 6607b0d03c21174ef7b4a314388d953b52bfc5b6 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:07:07 +0000 Subject: [PATCH 24/31] sections renamed to topics --- .../report/ai/expression/Summarizer.java | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index ecef898ca..fcfac1d52 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -56,7 +56,7 @@ public class Summarizer { .putAdditionalProperty("properties", JsonValue.from(Map.of( "headline", Map.of("type", "string"), "one_paragraph_summary", Map.of("type", "string"), - "sections", Map.of("type", "array", "minimum", 1, "items", Map.of( + "topics", Map.of("type", "array", "minimum", 1, "items", Map.of( "type", "object", "required", List.of("headline", "one_sentence_summary", "dataset_ids"), "properties", Map.of( @@ -149,10 +149,10 @@ public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
      • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + - "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + - "- A headline summarizing the section's key findings\n" + - "- A concise one-sentence summary of the experimental results\n\n" + - "These sections will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; + "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" + + "- A headline summarizing the key experimental results within the topic\n" + + "- A concise one-sentence summary of the topic's experimental results\n\n" + + "These topics will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers." } public JSONObject summarizeExperiments(List experiments) { @@ -197,12 +197,12 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, } Set seenDatasetIds = new HashSet<>(); - JSONArray deduplicatedSections = new JSONArray(); - JSONArray sections = summaryResponse.getJSONArray("sections"); + JSONArray deduplicatedTopics = new JSONArray(); + JSONArray topics = summaryResponse.getJSONArray("topics"); - for (int i = 0; i < sections.length(); i++) { - JSONObject section = sections.getJSONObject(i); - JSONArray datasetIds = section.getJSONArray("dataset_ids"); + for (int i = 0; i < topics.length(); i++) { + JSONObject topic = topics.getJSONObject(i); + JSONArray datasetIds = topic.getJSONArray("dataset_ids"); JSONArray summaries = new JSONArray(); for (int j = 0; j < datasetIds.length(); j++) { @@ -211,7 +211,7 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, // Warn and skip if the id doesn't exist if (!datasetSummaries.containsKey(id)) { System.out.println( - "WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); + "WARNING: dataset_id '" + id + "' does not exist. 
Excluding from final output."); continue; } // Skip if we've seen it @@ -222,34 +222,34 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, summaries.put(datasetSummaries.get(id)); } - // Update section with mapped summaries and remove dataset_ids key - section.put("summaries", summaries); - section.remove("dataset_ids"); - deduplicatedSections.put(section); + // Update topic with mapped summaries and remove dataset_ids key + topic.put("summaries", summaries); + topic.remove("dataset_ids"); + deduplicatedTopics.put(topic); } // Find missing dataset IDs Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); missingDatasetIds.removeAll(seenDatasetIds); - // If there are missing IDs, add an "Others" section + // If there are missing IDs, add an "Others" topic if (!missingDatasetIds.isEmpty()) { JSONArray otherSummaries = new JSONArray(); for (String id : missingDatasetIds) { otherSummaries.put(datasetSummaries.get(id)); } - JSONObject otherSection = new JSONObject(); - otherSection.put("headline", "Other"); - otherSection.put("one_sentence_summary", - "These experiments were not grouped into sub-sections by the AI."); - otherSection.put("summaries", otherSummaries); - deduplicatedSections.put(otherSection); + JSONObject otherTopic = new JSONObject(); + otherTopic.put("headline", "Other"); + otherTopic.put("one_sentence_summary", + "These experiments were not grouped by the AI."); + otherTopic.put("summaries", otherSummaries); + deduplicatedTopics.put(otherTopic); } // Create final deduplicated summary JSONObject finalSummary = new JSONObject(summaryResponse.toString()); - finalSummary.put("sections", deduplicatedSections); + finalSummary.put("topics", deduplicatedTopics); return finalSummary; } From 3d3625e409f4a89c0953971d9c2bc5661946abb8 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:24:17 +0000 Subject: [PATCH 25/31] add assay_type and experiment_name to first phase outputs to aid second phase --- .../ai/expression/GeneRecordProcessor.java | 16 ++++++++++++++++ .../model/report/ai/expression/Summarizer.java | 3 +++ 2 files changed, 19 insertions(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 6c5349953..d9317e587 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -36,6 +36,10 @@ public interface ExperimentInputs { String getDatasetId(); + String getAssayType(); + + String getExperimentName(); + String getDigest(); JSONObject getExperimentData(); @@ -107,6 +111,8 @@ private static List processExpressionData(RecordInstance recor } String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + String assayType = experimentRow.getAttributeValue("assay_type").getValue(); + String experimentName = experimentRow.getAttributeValue("display_name").getValue(); List filteredData = readFilteredData(datasetId, expressionGraphsDataTable); @@ -118,6 +124,16 @@ private static List processExpressionData(RecordInstance recor public String getDatasetId() { return datasetId; } + + @Override + public String getAssayType() { + return assayType; + } + + @Override + public String getExperimentName() { + return experimentName; + } @Override public String getCacheKey() { diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index fcfac1d52..d7d86cd54 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -133,7 +133,10 @@ public CompletableFuture describeExperiment(ExperimentInputs experim String jsonString = completion.choices().get(0).message().content().get(); try { JSONObject jsonObject = new JSONObject(jsonString); + // add some fields directly to aid the final summarization jsonObject.put("dataset_id", experimentInputs.getDatasetId()); + jsonObject.put("assay_type", experimentInputs.getAssayType()); + jsonObject.put("experiment_name", experimentInputs.getExperimentName()); return jsonObject; } catch (JSONException e) { From e85ad1f2fe50fbab64d69b8b74eca4fea97193c4 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:52:41 +0000 Subject: [PATCH 26/31] sort second level inputs and add DATA_MODEL_VERSION for better cache control --- .../report/ai/expression/GeneRecordProcessor.java | 8 ++++++-- .../model/report/ai/expression/Summarizer.java | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index d9317e587..c4dc7c002 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -30,6 +30,10 @@ public class GeneRecordProcessor { public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + // Increment this to invalidate all previous cache entries: + // (for example if changing first level model outputs rather than inputs which are already digestified) + private static final String DATA_MODEL_VERSION = "v2"; + public interface ExperimentInputs { String getCacheKey(); @@ -86,7 +90,7 @@ public String getDigest() { List digests = experimentsWithData.stream() .map(exp -> new JSONObject().put("digest", exp.getDigest())) .collect(Collectors.toList()); - return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests)); + return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getFinalSummaryPrompt.apply(digests)); } }; @@ -142,7 +146,7 @@ public String getCacheKey() { @Override public String getDigest() { - return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData())); + return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getExperimentPrompt.apply(getExperimentData())); } @Override diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d7d86cd54..aa6402727 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -148,9 +148,15 @@ public CompletableFuture describeExperiment(ExperimentInputs experim } public static String getFinalSummaryMessage(List experiments) { - + + List sortedExperiments = + experiments.sort( + Comparator.comparing((JSONObject obj) -> 

From e85ad1f2fe50fbab64d69b8b74eca4fea97193c4 Mon Sep 17 00:00:00 2001
From: Bob
Date: Fri, 28 Feb 2025 11:52:41 +0000
Subject: [PATCH 26/31] sort second level inputs and add DATA_MODEL_VERSION for better cache control

---
 .../report/ai/expression/GeneRecordProcessor.java |  8 ++++++--
 .../model/report/ai/expression/Summarizer.java    | 10 ++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index d9317e587..c4dc7c002 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -30,6 +30,10 @@ public class GeneRecordProcessor {
   public static final List<String> REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE);
 
+  // Increment this to invalidate all previous cache entries:
+  // (for example if changing first level model outputs rather than inputs which are already digestified)
+  private static final String DATA_MODEL_VERSION = "v2";
+
   public interface ExperimentInputs {
 
     String getCacheKey();
 
@@ -86,7 +90,7 @@ public String getDigest() {
         List<JSONObject> digests = experimentsWithData.stream()
            .map(exp -> new JSONObject().put("digest", exp.getDigest()))
            .collect(Collectors.toList());
-        return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests));
+        return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getFinalSummaryPrompt.apply(digests));
       }
     };
 
@@ -142,7 +146,7 @@ public String getCacheKey() {
 
         @Override
         public String getDigest() {
-          return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData()));
+          return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getExperimentPrompt.apply(getExperimentData()));
         }
 
         @Override
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index d7d86cd54..aa6402727 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -148,9 +148,15 @@ public CompletableFuture describeExperim
   }
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
-
+
+    List<JSONObject> sortedExperiments =
+      experiments.sort(
+        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+          .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+      );
+
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
-      String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
+      String.format("```json\n%s\n```\n\n", new JSONArray(sortedExperiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
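Two things happen in PATCH 26: the MD5 cache keys are salted with a hand-bumped DATA_MODEL_VERSION, so a change to what the first phase is asked to produce can invalidate every cached entry at once, and the second-phase inputs are sorted by importance and confidence. Note that `List.sort` returns `void`, so the `List sortedExperiments = experiments.sort(...)` assignment introduced here cannot compile as written; the next commit drops the assignment and sorts in place. A self-contained sketch of the version-salted key, with `MessageDigest` standing in for the project's EncryptionUtil.md5 and all literal values hypothetical:

```java
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class CacheKeySketch {

  // Bump this to orphan every previously cached entry, as in GeneRecordProcessor
  private static final String DATA_MODEL_VERSION = "v2";

  // Same recipe as the patched getDigest(): model id + version + prompt -> md5 hex
  static String digest(String aiChatModel, String prompt) throws NoSuchAlgorithmException {
    String input = aiChatModel + ":" + DATA_MODEL_VERSION + ":" + prompt;
    byte[] hash = MessageDigest.getInstance("MD5").digest(input.getBytes(StandardCharsets.UTF_8));
    return String.format("%032x", new BigInteger(1, hash));
  }

  public static void main(String[] args) throws NoSuchAlgorithmException {
    // Changing either the model or the version string produces a different key,
    // which is what forces a cache miss and a fresh AI call.
    System.out.println(digest("gpt-4o-2024-11-20", "describe experiment DS_0000001"));
  }
}
```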

From dada1a5b04e456d4e0232501f9517a1f9f04ff8f Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 07:24:40 -0500
Subject: [PATCH 27/31] increase concurrency and fix bugs

---
 .../report/ai/expression/AiExpressionCache.java |  2 +-
 .../model/report/ai/expression/Summarizer.java  | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index eae3a6dfc..3ea0a6dc1 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -34,7 +34,7 @@ public class AiExpressionCache {
   private static Logger LOG = Logger.getLogger(AiExpressionCache.class);
 
   // parallel processing
-  private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 5;
+  private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 10;
 
   // cache location
   private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR";
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index aa6402727..23860cb59 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -6,6 +6,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
+import java.util.Comparator;
 
 import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs;
 import org.gusdb.fgputil.json.JsonUtil;
@@ -149,19 +150,18 @@ public CompletableFuture describeExperim
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
 
-    List<JSONObject> sortedExperiments =
-      experiments.sort(
-        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
-          .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
-      );
+    experiments.sort(
+      Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+        .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+    );
 
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
-      String.format("```json\n%s\n```\n\n", new JSONArray(sortedExperiments).toString(2)) +
+      String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
       "- A concise one-sentence summary of the topic's experimental results\n\n" +
-      "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers."
+      "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers.";
   }
 
   public JSONObject summarizeExperiments(List<JSONObject> experiments) {
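PATCH 27 raises the per-request concurrency cap and fixes the sort by mutating the list in place (plus the missing semicolon on the prompt string). The comparator idiom it settles on — primary key descending, ties broken by a second key descending — is shown below on plain JSONObjects; the field names match the summaries, everything else is illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.json.JSONObject;

public class ImportanceSortSketch {
  public static void main(String[] args) {
    List<JSONObject> experiments = new ArrayList<>(List.of(
        new JSONObject().put("dataset_id", "A").put("biological_importance", 2).put("confidence", 5),
        new JSONObject().put("dataset_id", "B").put("biological_importance", 4).put("confidence", 3),
        new JSONObject().put("dataset_id", "C").put("biological_importance", 4).put("confidence", 5)));

    // Same shape as the patch: most important first, higher confidence breaks ties.
    experiments.sort(
        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder()));

    // Prints C, B, A
    experiments.forEach(e -> System.out.println(e.getString("dataset_id")));
  }
}
```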

From 5c20b3238bb1cf73d5cc28d704a82c185a794e38 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 08:10:59 -0500
Subject: [PATCH 28/31] apply experiment summary reporting in proper place

---
 .../model/report/ai/expression/AiExpressionCache.java    | 8 ++++++++
 .../apicommon/model/report/ai/expression/Summarizer.java | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index 3ea0a6dc1..c7f108fb8 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -10,6 +10,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
+import java.util.Comparator;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -225,6 +226,13 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs,
         // first populate each dataset entry as needed and collect experiment descriptors
         List<JSONObject> experiments = populateExperiments(summaryInputs.getExperimentsWithData(), experimentDescriber);
 
+        // sort them most-interesting first so that the "Other" section will be filled
+        // in that order (and also to give the AI the data in a sensible order)
+        experiments.sort(
+          Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+        );
+
         // summarize experiments and store
         getPopulator(summaryInputs.getDigest(), () -> experimentSummarizer.apply(experiments)).accept(entryDir);
       },
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index 23860cb59..daa5e3d39 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -6,7 +6,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
-import java.util.Comparator;
 
 import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs;
 import org.gusdb.fgputil.json.JsonUtil;
@@ -150,11 +149,6 @@ public CompletableFuture describeExperim
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
 
-    experiments.sort(
-      Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
-        .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
-    );
-
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
       String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
       "- A concise one-sentence summary of the topic's experimental results\n\n" +
       "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers.";

From 5bd73448b7f2127f6630c076a90327ee6cf31554 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 08:17:11 -0500
Subject: [PATCH 29/31] Other topic section wording improved

---
 .../apidb/apicommon/model/report/ai/expression/Summarizer.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index daa5e3d39..359212bc4 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -245,7 +245,7 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       JSONObject otherTopic = new JSONObject();
       otherTopic.put("headline", "Other");
       otherTopic.put("one_sentence_summary",
-          "These experiments were not grouped by the AI.");
+          "The AI ordered these experiments by biological importance but did not group them into topics.");
       otherTopic.put("summaries", otherSummaries);
       deduplicatedTopics.put(otherTopic);
     }
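PATCHes 28 and 29 move that sort out of prompt construction and into AiExpressionCache.populateSummary, so one ordered list feeds both the second-phase prompt and the consolidation step that builds the "Other" topic, and the user-facing wording for that topic is updated to match. Below is a compact sketch of the sort-once-then-summarize flow; the Function stands in for experimentSummarizer and simply echoes the order it receives, and all names and values are illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;

import org.json.JSONArray;
import org.json.JSONObject;

public class SortBeforeSummarizeSketch {

  // Stand-in for experimentSummarizer.apply(experiments): echoes the ordered ids.
  static final Function<List<JSONObject>, JSONObject> SUMMARIZER = experiments -> {
    JSONArray ids = new JSONArray();
    experiments.forEach(e -> ids.put(e.getString("dataset_id")));
    return new JSONObject().put("ordered_dataset_ids", ids);
  };

  public static void main(String[] args) {
    List<JSONObject> experiments = new ArrayList<>(List.of(
        new JSONObject().put("dataset_id", "DS_minor").put("biological_importance", 1).put("confidence", 2),
        new JSONObject().put("dataset_id", "DS_major").put("biological_importance", 5).put("confidence", 5)));

    // Sort once, before summarization, so every downstream consumer sees the same order.
    experiments.sort(
        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder()));

    System.out.println(SUMMARIZER.apply(experiments).toString(2)); // DS_major listed first
  }
}
```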

From ac11cafbfa146715e302953a0bf890ecd3cbfd05 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 09:05:34 -0500
Subject: [PATCH 30/31] banish empty topics

---
 .../model/report/ai/expression/GeneRecordProcessor.java  | 2 +-
 .../apicommon/model/report/ai/expression/Summarizer.java | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index c4dc7c002..c18080da3 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -32,7 +32,7 @@ public class GeneRecordProcessor {
 
   // Increment this to invalidate all previous cache entries:
   // (for example if changing first level model outputs rather than inputs which are already digestified)
-  private static final String DATA_MODEL_VERSION = "v2";
+  private static final String DATA_MODEL_VERSION = "v3";
 
   public interface ExperimentInputs {
 
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index 359212bc4..e11957295 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -226,9 +226,12 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       }
 
       // Update topic with mapped summaries and remove dataset_ids key
-      topic.put("summaries", summaries);
-      topic.remove("dataset_ids");
-      deduplicatedTopics.put(topic);
+      // but only if it's a non-empty topic (can happen with bad dataset_ids, see above)
+      if (summaries.length() > 0) {
+        topic.put("summaries", summaries);
+        topic.remove("dataset_ids");
+        deduplicatedTopics.put(topic);
+      }
     }
 
     // Find missing dataset IDs
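PATCH 30 guards against topics whose `dataset_ids` never matched a real summary (the model can emit bogus ids): a topic is kept only if at least one summary was mapped into it. The guard in isolation, with hypothetical data and the same org.json API the patch uses:

```java
import org.json.JSONArray;
import org.json.JSONObject;

public class EmptyTopicGuardSketch {
  public static void main(String[] args) {
    JSONArray deduplicatedTopics = new JSONArray();

    JSONObject topic = new JSONObject()
        .put("headline", "Oocyst development")
        .put("dataset_ids", new JSONArray());  // none of the ids resolved to summaries

    JSONArray summaries = new JSONArray();     // stays empty because nothing matched

    // Same guard as the patch: only keep topics that actually gathered summaries.
    if (summaries.length() > 0) {
      topic.put("summaries", summaries);
      topic.remove("dataset_ids");
      deduplicatedTopics.put(topic);
    }

    System.out.println(deduplicatedTopics.length()); // 0 -- the empty topic is dropped
  }
}
```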

From 27fa586b803658101aea47d1e1a806e9baa54537 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 11:57:30 -0500
Subject: [PATCH 31/31] preserve sort order during consolidation step

---
 .../report/ai/expression/AiExpressionCache.java    |  1 +
 .../report/ai/expression/GeneRecordProcessor.java  |  2 +-
 .../model/report/ai/expression/Summarizer.java     | 15 ++++++++-------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index c7f108fb8..688a8dcb3 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -11,6 +11,7 @@
 import java.util.List;
 import java.util.Optional;
 import java.util.Comparator;
+import java.util.stream.Collectors;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index c18080da3..26fa39bab 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -32,7 +32,7 @@ public class GeneRecordProcessor {
 
   // Increment this to invalidate all previous cache entries:
   // (for example if changing first level model outputs rather than inputs which are already digestified)
-  private static final String DATA_MODEL_VERSION = "v3";
+  private static final String DATA_MODEL_VERSION = "v3b";
 
   public interface ExperimentInputs {
 
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index e11957295..7d4e7c009 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -1,7 +1,7 @@
 package org.apidb.apicommon.model.report.ai.expression;
 
-import java.util.HashMap;
-import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -193,13 +193,14 @@ public JSONObject summarizeExperiments(List experiments) {
   private static JSONObject consolidateSummary(JSONObject summaryResponse,
       List<JSONObject> individualResults) {
 
-    // Gather all dataset IDs from individualResults and map them to summaries
-    Map<String, JSONObject> datasetSummaries = new HashMap<>();
+    // Gather all dataset IDs from individualResults and map them to summaries.
+    // Preserving the order of individualResults.
+    Map<String, JSONObject> datasetSummaries = new LinkedHashMap<>();
     for (JSONObject result : individualResults) {
       datasetSummaries.put(result.getString("dataset_id"), result);
    }
 
-    Set<String> seenDatasetIds = new HashSet<>();
+    Set<String> seenDatasetIds = new LinkedHashSet<>();
     JSONArray deduplicatedTopics = new JSONArray();
 
     JSONArray topics = summaryResponse.getJSONArray("topics");
@@ -234,8 +235,8 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       }
     }
 
-    // Find missing dataset IDs
-    Set<String> missingDatasetIds = new HashSet<>(datasetSummaries.keySet());
+    // Find missing dataset IDs (preserve dataset order)
+    Set<String> missingDatasetIds = new LinkedHashSet<>(datasetSummaries.keySet());
     missingDatasetIds.removeAll(seenDatasetIds);
 
     // If there are missing IDs, add an "Others" topic
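The LinkedHashMap/LinkedHashSet switch in PATCH 31 matters because consolidateSummary iterates datasetSummaries to build the "Other" topic: a plain HashMap would scramble the importance-sorted order established in populateSummary, while the linked variants replay insertion order. A tiny, self-contained illustration with hypothetical keys:

```java
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

public class OrderPreservationSketch {
  public static void main(String[] args) {
    // Already sorted most-interesting first, as populateSummary now guarantees
    String[] sortedIds = { "DS_best", "DS_good", "DS_ok", "DS_minor" };

    Map<String, String> hashed = new HashMap<>();
    Map<String, String> linked = new LinkedHashMap<>();
    for (String id : sortedIds) {
      hashed.put(id, "summary of " + id);
      linked.put(id, "summary of " + id);
    }

    // HashMap iteration order depends on hashing; LinkedHashMap replays insertion
    // order, which is what keeps the "Other" topic sorted most-interesting first.
    System.out.println("HashMap keys:       " + hashed.keySet());
    System.out.println("LinkedHashMap keys: " + linked.keySet());
  }
}
```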