From 8d16493c3c4ec0e116c99c90a9b672e032f53984 Mon Sep 17 00:00:00 2001 From: Bob Date: Wed, 12 Feb 2025 22:03:38 +0000 Subject: [PATCH 01/31] WIP --- .../ai/SingleGeneAiExpressionReporter.java | 12 + .../ai/expression/ExperimentProcessor.java | 75 ++++ .../report/ai/expression/ExpressionData.java | 22 ++ .../report/ai/expression/Summarizer.java | 320 ++++++++++++++++++ 4 files changed, 429 insertions(+) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java new file mode 100644 index 000000000..aa811cf39 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -0,0 +1,12 @@ +package org.apidb.apicommon.model.report.ai; + +import org.gusdb.wdk.model.report.AbstractReporter; +import org.apidb.apicommon.model.report.ai.expression.Summarizer; + +public class SingleGeneAiExpressionReporter extends AbstractReporter { + + // configure: is any config needed? + + // write: does the business - see SingleGeneReporter for example +} + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java new file mode 100644 index 000000000..2ba4b6fc6 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java @@ -0,0 +1,75 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.json.JSONArray; +import org.json.JSONObject; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public class ExperimentProcessor { + private static final Set KEYS_TO_KEEP = Set.of( + "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", + "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" + ); + + + public static List processExpressionData(ExpressionData expressionData) { + return processExpressionData(expressionData, 0); + } + + // for debugging only + public static List processExpressionData(ExpressionData expressionData, String datasetId) { + List experiments = processExpressionData(expressionData, 0); + return experiments.stream() + .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) + .collect(Collectors.toList()); + } + + // maxExperiments is for dev/debugging only + public static List processExpressionData(ExpressionData expressionData, int maxExperiments) { + List experiments = new ArrayList<>(); + + for (JSONObject expressionGraph : expressionData.getExpressionGraphs()) { + String datasetId = expressionGraph.getString("dataset_id"); + + // Extract only relevant keys from expressionGraph + JSONObject experimentInfo = new JSONObject(); + for (String key : KEYS_TO_KEEP) { + if (expressionGraph.has(key)) { + experimentInfo.put(key, expressionGraph.get(key)); + } + } + + // Filter expressionGraphsDataTable to match dataset_id + List 
filteredData = new ArrayList<>(); + for (JSONObject entry : expressionData.getExpressionGraphsDataTable()) { + if (datasetId.equals(entry.getString("dataset_id"))) { + JSONObject dataEntry = new JSONObject(); + dataEntry.put("sample_name", entry.getString("sample_name")); + dataEntry.put("value", entry.get("value")); + if (entry.has("standard_error")) { + dataEntry.put("standard_error", entry.get("standard_error")); + } + if (entry.has("percentile_channel1")) { + dataEntry.put("percentile_channel1", entry.get("percentile_channel1")); + } + if (entry.has("percentile_channel2")) { + dataEntry.put("percentile_channel2", entry.get("percentile_channel2")); + } + filteredData.add(dataEntry); + } + } + + // Combine and store experiment data + experimentInfo.put("data", filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) { + break; + } + } + + return experiments; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java new file mode 100644 index 000000000..c2e688878 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java @@ -0,0 +1,22 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.json.JSONObject; +import java.util.List; + +public class ExpressionData { + private final List expressionGraphs; + private final List expressionGraphsDataTable; + + public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { + this.expressionGraphs = expressionGraphs; + this.expressionGraphsDataTable = expressionGraphsDataTable; + } + + public List getExpressionGraphs() { + return expressionGraphs; + } + + public List getExpressionGraphsDataTable() { + return expressionGraphsDataTable; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java new file mode 100644 index 000000000..65117fc60 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -0,0 +1,320 @@ +package org.apidb.apicommon.model.report.ai.expression; + +// +// TO DO - add deps to pom.xml +// + +import org.json.JSONObject; +import org.json.JSONArray; +import org.json.JSONException; + +import com.openai.client.OpenAIClientAsync; +import com.openai.client.okhttp.OpenAIOkHttpClientAsync; +import com.openai.models.ChatCompletionCreateParams; +import com.openai.models.ChatModel; +import com.openai.models.ChatCompletion; +import com.openai.models.ResponseFormatJsonSchema; +import com.openai.models.ResponseFormatJsonSchema.JsonSchema; +import com.openai.core.JsonValue; +import java.util.List; +import java.util.Map; +import java.util.HashMap; +import java.util.Set; +import java.util.HashSet; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; + +public class Summarizer { + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() + .fromEnv() // Uses OPENAI_API_KEY from env + .maxRetries(32) // Handle 429 errors + .build(); + + // provide exact model number for semi-reproducibility + private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + private static int MAX_RESPONSE_TOKENS = 5000; + + private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; + + // Prepare JSON schemas for structured responses + private static final JsonSchema.Schema experimentResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes") + ) + ) + .build(); + + private static final JsonSchema.Schema finalResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", + Map.of("type", "array", + "minimum", 1, + "items", + Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", + Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", + "items", Map.of("type", "string")) + ) + ) + ) + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ) + ) + ) + .build(); + + public static JSONObject summariseExpression(String geneId, String projectId, String serviceBaseUrl) { + System.out.println("Summarising expression for Gene ID: " + geneId + " with model: " + model.toString()); + + // Placeholder for the actual implementation + System.out.println("Fetching data from: " + serviceBaseUrl); + + try { + // Call the API client to fetch expression data + ExpressionData expressionData = WdkClient.fetchExpressionData(serviceBaseUrl, geneId, projectId); + + // Print retrieved data (debugging) + System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); + System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); + + // Process expression data further into a list of pruned metadata plus data + List experimentsWithData = ExperimentProcessor.processExpressionData(expressionData); + System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); + + // Send AI requests in parallel + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + List> aiRequests = experimentsWithData.stream() + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); + // Wait for all requests to complete + List responses = aiRequests.stream() + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); + + // Debug output + // System.out.println("Individual responses:"); + // responses.forEach(response -> System.out.println(response.toString(2))); + // System.exit(0); + + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; + + } catch (Exception e) { + // Handle errors gracefully + System.err.println("Error fetching expression data: " + e.getMessage()); + e.printStackTrace(); // Print stack 
trace for debugging + } + + return null; + } + + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + + // Possible TO DO: AI EDIT DESCRIPTION + // Before sending the experiment+data to the AI, ask the AI to edit the `description` field + // as follows: (This should be cached by dataset_id only and would be called once per organism + // and would reduce tokens and "cognitive load" a little bit for the next step.) + // + // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + + + + // We don't need to send the dataset_id to the AI but it's useful to have in the response for phase two + JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; + experimentForAI.remove("dataset_id"); + +// specific experimental fixes for "DS_2e639b71f6" +// experimentForAI.put("display_name", "Transcriptional profiling of male head comparing swarming mosquito with control non-swarming mosquito (Anopheles coluzzii, aligned to A. gambiae PEST strain)"); +// experimentForAI.put("y_axis", "Expression Values for 2 channel microarray experiments are log ratios."); +// experimentForAI.put("description", "Anopheles coluzzii mosquitoes were collected in July in Vallée du Kou, Bobo-Dioulasso, Burkina Faso in 2011. Mosquitoes, mostly males, were collected in swarms using sweeping net during dusk. The indoor resting (nonswarming) males with antennal fibrillae becoming erect were collected in inhabited houses using vacuum aspiration just prior to swarming time. The collected mosquitoes were placed in tubes containing RNAlater to prevent RNA degradation. Mosquito species was molecularly identified by SINE-PCR. Total RNA from 50 male mosquito heads was isolated. Both swarm male heads and indoor resting male heads were used as samples. Laboratory reared 2-6-day old virgin An. gambiae s.s. male heads were used as reference samples (control). Three biological replicates were performed for each group.    \nMicroarray analysis: Cy5- and Cy3-labeled cRNA probes were generated from 200 ng of RNA using Agilent Technologies Low Input Quick Amp Labeling Kit according to the manufacturer's instructions. Probe hybridization to the microarray slides was performed with 2 μg cRNA probes.   "); + + String message = """ +The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata: + +```json +%s +``` + +**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. 
Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary. +**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments. +**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields. +""".formatted(experimentForAI.toString()); + +// System.out.println(message); /// DEBUG + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) + // .temperature(1.0) + .build(); + + // add dataset_id back to the response + return openAIClient.chat().completions().create(request) + .thenApply(completion -> { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + return jsonObject; + } catch (JSONException e) { + System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + } + }); + } + + + private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + + String message = """ +Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format: + +```json +%s +``` + +Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. 
Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `<i>` tags and use clear, scientific language accessible to non-native English speakers throughout your response.
+""".formatted(new JSONArray(experiments));
+
+        ChatCompletionCreateParams request = ChatCompletionCreateParams.builder()
+            .model(model)
+            .maxCompletionTokens(MAX_RESPONSE_TOKENS)
+            .responseFormat(ResponseFormatJsonSchema.builder()
+                .jsonSchema(JsonSchema.builder()
+                    .name("expression-summary")
+                    .schema(finalResponseSchema)
+                    .build())
+                .build())
+            .addSystemMessage(systemMessage)
+            .addUserMessage(message)
+            .build();
+
+        // System.out.println(message);
+
+        ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response
+        String jsonString = completion.choices().get(0).message().content().get();
+        JSONObject rawResponseObject = new JSONObject(jsonString);
+
+        // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI
+        JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments);
+
+        return finalResponseObject;
+    }
+
+
+    public static JSONObject consolidateSummary(JSONObject summaryResponse, List<JSONObject> individualResults) {
+        // Gather all dataset IDs from individualResults and map them to summaries
+        Map<String, JSONObject> datasetSummaries = new HashMap<>();
+        for (JSONObject result : individualResults) {
+            datasetSummaries.put(result.getString("dataset_id"), result);
+        }
+
+        Set<String> seenDatasetIds = new HashSet<>();
+        JSONArray deduplicatedSections = new JSONArray();
+        JSONArray sections = summaryResponse.getJSONArray("sections");
+
+        for (int i = 0; i < sections.length(); i++) {
+            JSONObject section = sections.getJSONObject(i);
+            JSONArray datasetIds = section.getJSONArray("dataset_ids");
+            JSONArray summaries = new JSONArray();
+
+            for (int j = 0; j < datasetIds.length(); j++) {
+                String id = datasetIds.getString(j);
+
+                // Warn and skip if the id doesn't exist
+                if (!datasetSummaries.containsKey(id)) {
+                    System.out.println("WARNING: summary section id '" + id + "' does not exist. 
Excluding from final output."); + continue; + } + // Skip if we've seen it + if (seenDatasetIds.contains(id)) continue; + + seenDatasetIds.add(id); + summaries.put(datasetSummaries.get(id)); + } + + // Update section with mapped summaries and remove dataset_ids key + section.put("summaries", summaries); + section.remove("dataset_ids"); + deduplicatedSections.put(section); + } + + // Find missing dataset IDs + Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); + missingDatasetIds.removeAll(seenDatasetIds); + + // If there are missing IDs, add an "Others" section + if (!missingDatasetIds.isEmpty()) { + JSONArray otherSummaries = new JSONArray(); + for (String id : missingDatasetIds) { + otherSummaries.put(datasetSummaries.get(id)); + } + + JSONObject otherSection = new JSONObject(); + otherSection.put("headline", "Other"); + otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); + otherSection.put("summaries", otherSummaries); + deduplicatedSections.put(otherSection); + } + + // Create final deduplicated summary + JSONObject finalSummary = new JSONObject(summaryResponse.toString()); + finalSummary.put("sections", deduplicatedSections); + return finalSummary; + } + +} + From f85f2a03173b9a1997db2148351335d3a48874d1 Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 20:47:07 +0000 Subject: [PATCH 02/31] it compiles - at least --- Model/pom.xml | 8 +++ .../ai/SingleGeneAiExpressionReporter.java | 49 +++++++++++++++++-- .../report/ai/expression/Summarizer.java | 45 +++++------------ 3 files changed, 65 insertions(+), 37 deletions(-) diff --git a/Model/pom.xml b/Model/pom.xml index c216d7981..0d1bbe5dd 100644 --- a/Model/pom.xml +++ b/Model/pom.xml @@ -134,6 +134,14 @@ io.vulpine.lib Jackfish + + + + com.openai + openai-java + 0.22.0 + + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index aa811cf39..06e18d3d3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -1,12 +1,55 @@ package org.apidb.apicommon.model.report.ai; import org.gusdb.wdk.model.report.AbstractReporter; +import org.gusdb.wdk.model.report.Reporter; +import org.gusdb.wdk.model.report.ReporterConfigException; import org.apidb.apicommon.model.report.ai.expression.Summarizer; +import org.gusdb.wdk.model.WdkModelException; -public class SingleGeneAiExpressionReporter extends AbstractReporter { +import org.json.JSONObject; +import java.io.IOException; +import java.io.OutputStream; - // configure: is any config needed? 
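The rewritten reporter below is driven by a single `cacheMode` option. As a rough caller-side sketch, assuming only the key read in `configure()` (the service wiring that delivers this JSON is not part of this patch):

```java
// Hypothetical caller-side configuration sketch; "cacheMode" is the only key
// configure() below reads, and it accepts "test" (the default) or "populate",
// matched case-insensitively by CacheMode.fromString().
JSONObject reporterConfig = new JSONObject().put("cacheMode", "populate");
// The reporter instance would then be set up via reporter.configure(reporterConfig).
```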
+public class SingleGeneAiExpressionReporter extends AbstractReporter { + + private enum CacheMode { + TEST("test"), + POPULATE("populate"); + private final String mode; + CacheMode(String mode) { + this.mode = mode; + } + public String getMode() { + return mode; + } + public static CacheMode fromString(String mode) throws IllegalArgumentException { + for (CacheMode cm : CacheMode.values()) { + if (cm.mode.equalsIgnoreCase(mode)) { + return cm; + } + } + throw new IllegalArgumentException("Invalid CacheMode: " + mode); + } + } + + private CacheMode _cacheMode = CacheMode.TEST; + + @Override + public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { + try { + _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + } catch (IllegalArgumentException e) { + throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); + } + return this; + } + + @Override + protected void write(OutputStream out) throws IOException, WdkModelException { + + } + - // write: does the business - see SingleGeneReporter for example } + diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 65117fc60..d94a03e58 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -103,16 +103,9 @@ public class Summarizer { ) .build(); - public static JSONObject summariseExpression(String geneId, String projectId, String serviceBaseUrl) { - System.out.println("Summarising expression for Gene ID: " + geneId + " with model: " + model.toString()); + public static JSONObject summariseExpression(ExpressionData expressionData) { - // Placeholder for the actual implementation - System.out.println("Fetching data from: " + serviceBaseUrl); - try { - // Call the API client to fetch expression data - ExpressionData expressionData = WdkClient.fetchExpressionData(serviceBaseUrl, geneId, projectId); - // Print retrieved data (debugging) System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); @@ -161,27 +154,17 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e - // We don't need to send the dataset_id to the AI but it's useful to have in the response for phase two + // We don't need to send the dataset_id to the AI but it's useful to have in the + // response for phase two - so we save it for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - -// specific experimental fixes for "DS_2e639b71f6" -// experimentForAI.put("display_name", "Transcriptional profiling of male head comparing swarming mosquito with control non-swarming mosquito (Anopheles coluzzii, aligned to A. gambiae PEST strain)"); -// experimentForAI.put("y_axis", "Expression Values for 2 channel microarray experiments are log ratios."); -// experimentForAI.put("description", "Anopheles coluzzii mosquitoes were collected in July in Vallée du Kou, Bobo-Dioulasso, Burkina Faso in 2011. Mosquitoes, mostly males, were collected in swarms using sweeping net during dusk. 
The indoor resting (nonswarming) males with antennal fibrillae becoming erect were collected in inhabited houses using vacuum aspiration just prior to swarming time. The collected mosquitoes were placed in tubes containing RNAlater to prevent RNA degradation. Mosquito species was molecularly identified by SINE-PCR. Total RNA from 50 male mosquito heads was isolated. Both swarm male heads and indoor resting male heads were used as samples. Laboratory reared 2-6-day old virgin An. gambiae s.s. male heads were used as reference samples (control). Three biological replicates were performed for each group.    \nMicroarray analysis: Cy5- and Cy3-labeled cRNA probes were generated from 200 ng of RNA using Agilent Technologies Low Input Quick Amp Labeling Kit according to the manufacturer's instructions. Probe hybridization to the microarray slides was performed with 2 μg cRNA probes.   "); - String message = """ -The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata: - -```json -%s -``` - -**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary. -**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments. -**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields. 
-""".formatted(experimentForAI.toString()); + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; // System.out.println(message); /// DEBUG @@ -219,15 +202,9 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = """ -Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format: - -```json -%s -``` - -Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response. 
-""".formatted(new JSONArray(experiments)); + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) From 089511c5203031143fc162bbbef28868c110efae Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:11:52 +0000 Subject: [PATCH 03/31] all wired together and compiles --- .../ai/SingleGeneAiExpressionReporter.java | 33 +- .../ai/expression/ExperimentProcessor.java | 75 --- .../ai/expression/GeneRecordProcessor.java | 83 ++++ .../report/ai/expression/Summarizer.java | 450 +++++++++--------- 4 files changed, 337 insertions(+), 304 deletions(-) delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 06e18d3d3..f424f3c66 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -5,14 +5,25 @@ import org.gusdb.wdk.model.report.ReporterConfigException; import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.record.RecordClass; +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.answer.stream.RecordStream; +import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; +import org.apidb.apicommon.model.TranscriptUtil; +import org.gusdb.wdk.model.record.TableField; +import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.WdkUserException; import org.json.JSONObject; import java.io.IOException; import java.io.OutputStream; +import java.util.Map; +import java.util.List; +import java.util.stream.Collectors; public class SingleGeneAiExpressionReporter extends AbstractReporter { - private enum CacheMode { + public enum CacheMode { TEST("test"), POPULATE("populate"); private final String mode; @@ -37,7 +48,9 @@ public static CacheMode fromString(String mode) throws IllegalArgumentException @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { - _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + if (config.has("cacheMode")) { + _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + } } catch (IllegalArgumentException e) { throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); } @@ -46,7 +59,23 @@ public Reporter 
configure(JSONObject config) throws ReporterConfigException, Wdk @Override protected void write(OutputStream out) throws IOException, WdkModelException { + RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); + Map tableFields = geneRecordClass.getTableFieldMap(); + List tables = List.of("ExpressionGraphs", "ExpressionGraphsDataTable").stream() + .map(name -> tableFields.get(name)) + .collect(Collectors.toList()); + + try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { + RecordInstance singleRecord = recordStream.iterator().next(); + JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); + out.write(expressionSummary.toString().getBytes()); + out.flush(); + } + catch (WdkUserException e) { + throw new WdkModelException(e); + } + } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java deleted file mode 100644 index 2ba4b6fc6..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExperimentProcessor.java +++ /dev/null @@ -1,75 +0,0 @@ -package org.apidb.apicommon.model.report.ai.expression; - -import org.json.JSONArray; -import org.json.JSONObject; -import java.util.ArrayList; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; - -public class ExperimentProcessor { - private static final Set KEYS_TO_KEEP = Set.of( - "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", - "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" - ); - - - public static List processExpressionData(ExpressionData expressionData) { - return processExpressionData(expressionData, 0); - } - - // for debugging only - public static List processExpressionData(ExpressionData expressionData, String datasetId) { - List experiments = processExpressionData(expressionData, 0); - return experiments.stream() - .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) - .collect(Collectors.toList()); - } - - // maxExperiments is for dev/debugging only - public static List processExpressionData(ExpressionData expressionData, int maxExperiments) { - List experiments = new ArrayList<>(); - - for (JSONObject expressionGraph : expressionData.getExpressionGraphs()) { - String datasetId = expressionGraph.getString("dataset_id"); - - // Extract only relevant keys from expressionGraph - JSONObject experimentInfo = new JSONObject(); - for (String key : KEYS_TO_KEEP) { - if (expressionGraph.has(key)) { - experimentInfo.put(key, expressionGraph.get(key)); - } - } - - // Filter expressionGraphsDataTable to match dataset_id - List filteredData = new ArrayList<>(); - for (JSONObject entry : expressionData.getExpressionGraphsDataTable()) { - if (datasetId.equals(entry.getString("dataset_id"))) { - JSONObject dataEntry = new JSONObject(); - dataEntry.put("sample_name", entry.getString("sample_name")); - dataEntry.put("value", entry.get("value")); - if (entry.has("standard_error")) { - dataEntry.put("standard_error", entry.get("standard_error")); - } - if (entry.has("percentile_channel1")) { - dataEntry.put("percentile_channel1", entry.get("percentile_channel1")); - } - if (entry.has("percentile_channel2")) { - dataEntry.put("percentile_channel2", entry.get("percentile_channel2")); - } - filteredData.add(dataEntry); - } - } - - // Combine and store 
experiment data - experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); - - if (maxExperiments > 0 && experiments.size() >= maxExperiments) { - break; - } - } - - return experiments; - } -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java new file mode 100644 index 000000000..0bb32c745 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -0,0 +1,83 @@ +package org.apidb.apicommon.model.report.ai.expression; + +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.record.TableValue; +import org.gusdb.wdk.model.record.TableValueRow; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.WdkModelException; + +import org.json.JSONArray; +import org.json.JSONObject; +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +/** + * expects a geneRecord with two tables: "ExpressionGraphs" and "ExpressionGraphsDataTable" + * + * returns a list of JSON Objects of data ready to feed the AI + */ + +public class GeneRecordProcessor { + private static final Set KEYS_TO_KEEP = + Set.of( + "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", + "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" + ); + + public static List processExpressionData(RecordInstance geneRecord) throws WdkModelException, WdkUserException { + return processExpressionData(geneRecord, 0); + } + + // for debugging only + public static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException, WdkUserException { + List experiments = processExpressionData(geneRecord, 0); + return experiments.stream() + .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) + .collect(Collectors.toList()); + } + + // maxExperiments is for dev/debugging only + public static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException, WdkUserException { + // return value: + List experiments = new ArrayList<>(); + + TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); + TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); + + for (TableValueRow experimentRow : expressionGraphs) { + JSONObject experimentInfo = new JSONObject(); + + // Extract all relevant attributes + for (String key : KEYS_TO_KEEP) { + experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); + } + + List filteredData = new ArrayList<>(); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + List thisExperimentDataRows = new ArrayList<>(); + for (TableValueRow dataRow : expressionGraphsDataTable) { + if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { + JSONObject dataEntry = new JSONObject(); + + // Extract relevant numeric fields + List dataKeys = List.of("value", "standard_error", "percentile_channel1", "percentile_channel2", "sample_name"); + for (String key : dataKeys) { + dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + } + + filteredData.add(dataEntry); + } + } + + experimentInfo.put("data", 
filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; + } + return experiments; + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d94a03e58..98961b7f1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,8 +1,8 @@ package org.apidb.apicommon.model.report.ai.expression; -// -// TO DO - add deps to pom.xml -// +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.WdkModelException; import org.json.JSONObject; import org.json.JSONArray; @@ -25,273 +25,269 @@ import java.util.stream.Collectors; public class Summarizer { - private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() - .fromEnv() // Uses OPENAI_API_KEY from env - .maxRetries(32) // Handle 429 errors - .build(); - - // provide exact model number for semi-reproducibility - private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 5000; + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() + .fromEnv() // Uses OPENAI_API_KEY from env + .maxRetries(32) // Handle 429 errors + .build(); + + // provide exact model number for semi-reproducibility + private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + private static int MAX_RESPONSE_TOKENS = 5000; - private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; - - // Prepare JSON schemas for structured responses - private static final JsonSchema.Schema experimentResponseSchema = - JsonSchema.Schema.builder() - .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "one_sentence_summary", Map.of("type", "string"), - "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), - "notes", Map.of("type", "string") - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "one_sentence_summary", - "biological_importance", - "confidence", - "experiment_keywords", - "notes") - ) - ) - .build(); - - private static final JsonSchema.Schema finalResponseSchema = - JsonSchema.Schema.builder() - .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "headline", Map.of("type", "string"), - "one_paragraph_summary", Map.of("type", "string"), - "sections", - Map.of("type", "array", - "minimum", 1, - "items", - Map.of( - "type", "object", - "required", List.of("headline", "one_sentence_summary", "dataset_ids"), - "properties", - Map.of( - "headline", Map.of("type", "string"), - "one_sentence_summary", Map.of("type", "string"), - "dataset_ids", Map.of("type", "array", - "items", Map.of("type", "string")) - ) - ) - ) - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "headline", - "one_paragraph_summary", - "dataset_ids" - ) - ) - ) - .build(); + private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; + + // Prepare JSON schemas for structured responses + // NOTE: this code is horrible to look at/read. It would be better to just define the schemas as JSON strings + // but this is only really nice when we have """ text block """ support, coming soon when we upgrade, perhaps? 
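As a sketch of what the NOTE above has in mind (assuming the Java text-block support it mentions), the first schema could be written as one JSON literal; the property names mirror the builder-based `experimentResponseSchema` below, and converting the string back into the SDK's `JsonValue` map is left out here:

```java
// Sketch only: the same structure as experimentResponseSchema below, expressed as plain JSON.
// Requires text blocks (Java 15+); the conversion back into a JsonValue map is not shown.
private static final String EXPERIMENT_RESPONSE_SCHEMA_JSON = """
    {
      "type": "object",
      "properties": {
        "one_sentence_summary":  { "type": "string" },
        "biological_importance": { "type": "integer", "minimum": 0, "maximum": 5 },
        "confidence":            { "type": "integer", "minimum": 0, "maximum": 5 },
        "experiment_keywords":   { "type": "array", "items": { "type": "string" } },
        "notes":                 { "type": "string" }
      },
      "required": ["one_sentence_summary", "biological_importance", "confidence",
                   "experiment_keywords", "notes"]
    }
    """;
```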
+ private static final JsonSchema.Schema experimentResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes") + ) + ) + .build(); + + private static final JsonSchema.Schema finalResponseSchema = + JsonSchema.Schema.builder() + .putAdditionalProperty("type", JsonValue.from("object")) + .putAdditionalProperty("properties", + JsonValue + .from(Map + .of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", + Map.of("type", "array", + "minimum", 1, + "items", + Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", + Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", + "items", Map.of("type", "string")) + ) + ) + ) + ) + ) + ) + .putAdditionalProperty("required", + JsonValue.from( + List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ) + ) + ) + .build(); - public static JSONObject summariseExpression(ExpressionData expressionData) { + public static JSONObject summarizeExpression(RecordInstance geneRecord) throws WdkUserException { - try { - // Print retrieved data (debugging) - System.out.println("Expression Graphs: " + expressionData.getExpressionGraphs().size()); - System.out.println("Expression Graphs Data Table: " + expressionData.getExpressionGraphsDataTable().size()); - - // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = ExperimentProcessor.processExpressionData(expressionData); - System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); + try { + // Process expression data further into a list of pruned metadata plus data + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - // Send AI requests in parallel + // Send AI requests in parallel // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI - List> aiRequests = experimentsWithData.stream() - .map(Summarizer::sendExperimentToOpenAI) - .collect(Collectors.toList()); - // Wait for all requests to complete - List responses = aiRequests.stream() - .map(CompletableFuture::join) // Blocks until each completes - .collect(Collectors.toList()); - - // Debug output + List> aiRequests = experimentsWithData.stream() + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); + // Wait for all requests to complete + List responses = aiRequests.stream() + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); + + // Debug output // System.out.println("Individual responses:"); - // responses.forEach(response -> System.out.println(response.toString(2))); + // responses.forEach(response -> System.out.println(response.toString(2))); // System.exit(0); JSONObject finalSummary = 
sendExperimentSummariesToOpenAI(responses); return finalSummary; - } catch (Exception e) { - // Handle errors gracefully - System.err.println("Error fetching expression data: " + e.getMessage()); - e.printStackTrace(); // Print stack trace for debugging - } - - return null; + } catch (WdkModelException e) { + // Handle errors gracefully + System.err.println("Error fetching expression data: " + e.getMessage()); + throw new WdkUserException(e); } + } - private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { - // Possible TO DO: AI EDIT DESCRIPTION - // Before sending the experiment+data to the AI, ask the AI to edit the `description` field - // as follows: (This should be cached by dataset_id only and would be called once per organism - // and would reduce tokens and "cognitive load" a little bit for the next step.) - // - // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." - // - // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + // Possible TO DO: AI EDIT DESCRIPTION + // Before sending the experiment+data to the AI, ask the AI to edit the `description` field + // as follows: (This should be cached by dataset_id only and would be called once per organism + // and would reduce tokens and "cognitive load" a little bit for the next step.) + // + // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. - // We don't need to send the dataset_id to the AI but it's useful to have in the - // response for phase two - so we save it for later - JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; - experimentForAI.remove("dataset_id"); + // We don't need to send the dataset_id to the AI but it's useful to have in the + // response for phase two - so we save it for later + JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; + experimentForAI.remove("dataset_id"); - String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. 
Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. 
Do not include any duplicate fields."; -// System.out.println(message); /// DEBUG - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("experiment-summary") - .schema(experimentResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) + // System.out.println(message); /// DEBUG + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) // .temperature(1.0) - .build(); + .build(); - // add dataset_id back to the response - return openAIClient.chat().completions().create(request) + // add dataset_id back to the response + return openAIClient.chat().completions().create(request) .thenApply(completion -> { - // response is a JSON string - String jsonString = completion.choices().get(0).message().content().get(); - try { - JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - return jsonObject; - } catch (JSONException e) { - System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); - } - }); - } - - - private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + return jsonObject; + } catch (JSONException e) { + System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + } + }); + } + + + private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. 
Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("expression-summary") - .schema(finalResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); - - // System.out.println(message); - - ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response - String jsonString = completion.choices().get(0).message().content().get(); - JSONObject rawResponseObject = new JSONObject(jsonString); - - // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI - JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(model) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("expression-summary") + .schema(finalResponseSchema) + .build()) + .build()) + .addSystemMessage(systemMessage) + .addUserMessage(message) + .build(); + + // System.out.println(message); + + ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response + String jsonString = completion.choices().get(0).message().content().get(); + JSONObject rawResponseObject = new JSONObject(jsonString); + + // TO DO - quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return finalResponseObject; - } + return finalResponseObject; + } - public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { - // Gather all dataset IDs from individualResults and map them to summaries - Map datasetSummaries = new HashMap<>(); - for (JSONObject result : individualResults) { - datasetSummaries.put(result.getString("dataset_id"), result); - } + public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { + // Gather all dataset IDs from individualResults and map them to summaries + Map datasetSummaries = new HashMap<>(); + for (JSONObject result : individualResults) { + datasetSummaries.put(result.getString("dataset_id"), result); + } - Set seenDatasetIds = new HashSet<>(); - JSONArray deduplicatedSections = new JSONArray(); - JSONArray sections = summaryResponse.getJSONArray("sections"); + Set seenDatasetIds = new HashSet<>(); + JSONArray deduplicatedSections = new JSONArray(); + JSONArray sections = summaryResponse.getJSONArray("sections"); - for (int i = 0; i < sections.length(); i++) { - JSONObject section = sections.getJSONObject(i); - JSONArray datasetIds = section.getJSONArray("dataset_ids"); - JSONArray summaries = new JSONArray(); + for (int i = 0; i < sections.length(); i++) { + JSONObject section = sections.getJSONObject(i); + JSONArray datasetIds = section.getJSONArray("dataset_ids"); + JSONArray summaries = new JSONArray(); - for (int j = 0; j < datasetIds.length(); j++) { - String id = datasetIds.getString(j); + for (int j = 0; j < datasetIds.length(); j++) { + String id = datasetIds.getString(j); - 
// Warn and skip if the id doesn't exist - if (!datasetSummaries.containsKey(id)) { - System.out.println("WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); - continue; - } - // Skip if we've seen it - if (seenDatasetIds.contains(id)) continue; + // Warn and skip if the id doesn't exist + if (!datasetSummaries.containsKey(id)) { + System.out.println("WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); + continue; + } + // Skip if we've seen it + if (seenDatasetIds.contains(id)) continue; - seenDatasetIds.add(id); - summaries.put(datasetSummaries.get(id)); - } + seenDatasetIds.add(id); + summaries.put(datasetSummaries.get(id)); + } - // Update section with mapped summaries and remove dataset_ids key - section.put("summaries", summaries); - section.remove("dataset_ids"); - deduplicatedSections.put(section); - } + // Update section with mapped summaries and remove dataset_ids key + section.put("summaries", summaries); + section.remove("dataset_ids"); + deduplicatedSections.put(section); + } - // Find missing dataset IDs - Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); - missingDatasetIds.removeAll(seenDatasetIds); + // Find missing dataset IDs + Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); + missingDatasetIds.removeAll(seenDatasetIds); - // If there are missing IDs, add an "Others" section - if (!missingDatasetIds.isEmpty()) { - JSONArray otherSummaries = new JSONArray(); - for (String id : missingDatasetIds) { - otherSummaries.put(datasetSummaries.get(id)); - } + // If there are missing IDs, add an "Others" section + if (!missingDatasetIds.isEmpty()) { + JSONArray otherSummaries = new JSONArray(); + for (String id : missingDatasetIds) { + otherSummaries.put(datasetSummaries.get(id)); + } - JSONObject otherSection = new JSONObject(); - otherSection.put("headline", "Other"); - otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); - otherSection.put("summaries", otherSummaries); - deduplicatedSections.put(otherSection); - } - - // Create final deduplicated summary - JSONObject finalSummary = new JSONObject(summaryResponse.toString()); - finalSummary.put("sections", deduplicatedSections); - return finalSummary; + JSONObject otherSection = new JSONObject(); + otherSection.put("headline", "Other"); + otherSection.put("one_sentence_summary", "These experiments were not grouped into sub-sections by the AI."); + otherSection.put("summaries", otherSummaries); + deduplicatedSections.put(otherSection); } + // Create final deduplicated summary + JSONObject finalSummary = new JSONObject(summaryResponse.toString()); + finalSummary.put("sections", deduplicatedSections); + return finalSummary; + } + } From 5752cc496bbb04350cbbb288b0f42f691bcb72cb Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:18:59 +0000 Subject: [PATCH 04/31] extra comment --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index f424f3c66..b355b0a3e 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -67,6 +67,7 @@ protected void write(OutputStream out) throws 
IOException, WdkModelException { try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { RecordInstance singleRecord = recordStream.iterator().next(); + // we will need to pass `_cacheMode` to `summarizeExpression()`... JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); out.write(expressionSummary.toString().getBytes()); out.flush(); From 4ce3f5fa6e4dd129267bf254d985c37ade53ba9c Mon Sep 17 00:00:00 2001 From: Bob Date: Thu, 13 Feb 2025 23:29:33 +0000 Subject: [PATCH 05/31] one more comment --- .../model/report/ai/expression/GeneRecordProcessor.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 0bb32c745..624216b72 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -58,6 +58,7 @@ public static List processExpressionData(RecordInstance geneRecord, List filteredData = new ArrayList<>(); String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + // (this would be more efficient with a `Map>` made before the `expressionGraphs` loop) List thisExperimentDataRows = new ArrayList<>(); for (TableValueRow dataRow : expressionGraphsDataTable) { if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { From eb203df748816055c55e30d39ae5c49d313cb45f Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 17 Feb 2025 12:43:06 +0000 Subject: [PATCH 06/31] move CacheMode into separate file --- .../apicommon/model/report/ai/CacheMode.java | 26 +++++++++++++++++++ .../ai/SingleGeneAiExpressionReporter.java | 22 +--------------- .../report/ai/expression/Summarizer.java | 4 ++- 3 files changed, 30 insertions(+), 22 deletions(-) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java new file mode 100644 index 000000000..d514ca813 --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java @@ -0,0 +1,26 @@ +package org.apidb.apicommon.model.report.ai; + +public enum CacheMode { + + TEST("test"), + POPULATE("populate"); + + private final String mode; + + CacheMode(String mode) { + this.mode = mode; + } + + public String getMode() { + return mode; + } + + public static CacheMode fromString(String mode) throws IllegalArgumentException { + for (CacheMode cm : CacheMode.values()) { + if (cm.mode.equalsIgnoreCase(mode)) { + return cm; + } + } + throw new IllegalArgumentException("Invalid CacheMode: " + mode); + } +} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index b355b0a3e..9b8a0f336 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -23,26 +23,6 @@ public class SingleGeneAiExpressionReporter extends AbstractReporter { - public enum CacheMode { - 
TEST("test"), - POPULATE("populate"); - private final String mode; - CacheMode(String mode) { - this.mode = mode; - } - public String getMode() { - return mode; - } - public static CacheMode fromString(String mode) throws IllegalArgumentException { - for (CacheMode cm : CacheMode.values()) { - if (cm.mode.equalsIgnoreCase(mode)) { - return cm; - } - } - throw new IllegalArgumentException("Invalid CacheMode: " + mode); - } - } - private CacheMode _cacheMode = CacheMode.TEST; @Override @@ -68,7 +48,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { RecordInstance singleRecord = recordStream.iterator().next(); // we will need to pass `_cacheMode` to `summarizeExpression()`... - JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord); + JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord, _cacheMode); out.write(expressionSummary.toString().getBytes()); out.flush(); } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 98961b7f1..009be461a 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,5 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; +import org.apidb.apicommon.model.report.ai.CacheMode; + import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.WdkModelException; @@ -105,7 +107,7 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord) throws WdkUserException { + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { // Process expression data further into a list of pruned metadata plus data From dfab346e1fc60eea9857f3c62f53a3a848b6c574 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 18 Feb 2025 09:33:13 +0000 Subject: [PATCH 07/31] WIP cache wiring --- .../ai/expression/AiExpressionCache.java | 109 ++++++++++++++++++ .../ai/expression/GeneRecordProcessor.java | 4 +- .../report/ai/expression/Summarizer.java | 57 +++++++-- 3 files changed, 158 insertions(+), 12 deletions(-) create mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java new file mode 100644 index 000000000..76a1cd65d --- /dev/null +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -0,0 +1,109 @@ +package org.apidb.apicommon.model.report.ai.expression; + + +import java.io.IOException; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.nio.file.Files; +import java.security.MessageDigest; +import java.security.NoSuchAlgorithmException; +import java.util.HexFormat; +import java.util.function.Function; +import org.json.JSONObject; +import org.json.JSONException; + +import org.gusdb.fgputil.cache.disk.OnDiskCache; +import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; +import 
org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException;
+
+public class AiExpressionCache extends OnDiskCache {
+
+  // Default cache location and timing settings
+  private static final Path DEFAULT_CACHE_DIR = Paths.get("/tmp/expressionCache");
+  private static final long DEFAULT_TIMEOUT_MILLIS = 5000;
+  private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500;
+
+  // No-argument constructor using defaults
+  public AiExpressionCache() throws IOException {
+    super(DEFAULT_CACHE_DIR, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS);
+  }
+
+  // Compute SHA-256 hash digest of input
+  private static String computeDigest(String input) throws NoSuchAlgorithmException {
+    MessageDigest digest = MessageDigest.getInstance("SHA-256");
+    byte[] hash = digest.digest(input.getBytes());
+    return HexFormat.of().formatHex(hash);
+  }
+
+  // Check if cached data is valid
+  public boolean isCacheValid(String cacheKey, String inputData) {
+
+    try {
+      FunctionWithException<Path, Boolean> visitor = entryDir -> {
+        Path digestFile = entryDir.resolve("digest.txt");
+
+        if (!Files.exists(digestFile)) {
+          System.out.println("No digest file found.");
+          return false;
+        }
+
+        // Read stored digest and compare
+        String cachedDigest = Files.readString(digestFile);
+        String computedDigest = computeDigest(inputData);
+
+        if (cachedDigest.equals(computedDigest)) {
+          System.out.println("Cache digest matches input.");
+          return true;
+        } else {
+          System.out.println("Cache digest mismatch! Cache is out of date.");
+          return false;
+        }
+      };
+
+      return visitContent(cacheKey, visitor);
+
+    } catch (EntryNotCreatedException e) {
+      System.out.println("Cache entry does not exist yet.");
+      return false;
+    } catch (Exception e) {
+      throw new RuntimeException("Error validating cache entry", e);
+    }
+  }
+
+  // Populate cache with computed data (Method 1: Takes computedData directly)
+  public void populateCache(String cacheKey, String inputData, JSONObject computedData) throws Exception {
+    ConsumerWithException<Path> populator = entryDir -> {
+      Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString());
+      Files.writeString(entryDir.resolve("digest.txt"), computeDigest(inputData));
+    };
+
+    // Populate with overwrite policy (assumes caller ensures it's necessary)
+    populateAndProcessContent(cacheKey, populator, path -> null, Overwrite.YES);
+  }
+
+// // Populate cache with a function that computes the result (Method 2: Uses a function)
+// public void populateCache(String cacheKey, String inputData, Function<String, JSONObject> computation) throws Exception {
+//   populateCache(cacheKey, inputData, computation.apply(inputData));
+// }
+
+  // Read cached data (throws IOException if missing)
+  public JSONObject readCachedData(String cacheKey) throws Exception {
+    FunctionWithException<Path, JSONObject> visitor = entryDir -> {
+      Path file = entryDir.resolve("cached_data.txt");
+      if (!Files.exists(file)) {
+        throw new IOException("Cache entry missing: " + file);
+      }
+      String fileContents = Files.readString(file);
+      try {
+        JSONObject jsonObject = new JSONObject(fileContents);
+        return jsonObject;
+      } catch (JSONException e) {
+        throw e;
+      }
+    };
+
+    return visitContent(cacheKey, visitor);
+  }
+
+}
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index 624216b72..4916fd76c 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++
b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -44,12 +44,14 @@ public static List processExpressionData(RecordInstance geneRecord, // return value: List experiments = new ArrayList<>(); + String geneId = geneRecord.getAttributeValue("gene_id").getValue(); TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); for (TableValueRow experimentRow : expressionGraphs) { JSONObject experimentInfo = new JSONObject(); - + experimentInfo.put("gene_id", geneId); + // Extract all relevant attributes for (String key : KEYS_TO_KEEP) { experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 009be461a..b575677e5 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,6 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; import org.apidb.apicommon.model.report.ai.CacheMode; +import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.WdkUserException; @@ -25,6 +26,7 @@ import java.util.HashSet; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; +import java.io.IOException; public class Summarizer { private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() @@ -32,9 +34,21 @@ public class Summarizer { .maxRetries(32) // Handle 429 errors .build(); + private static final AiExpressionCache cache; + + static { + AiExpressionCache tempCache = null; + try { + tempCache = new AiExpressionCache(); + } catch (IOException e) { + throw new RuntimeException("Failed to initialize AiExpressionCache", e); + } + cache = tempCache; + } + // provide exact model number for semi-reproducibility private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 5000; + private static int MAX_RESPONSE_TOKENS = 10000; private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; @@ -107,15 +121,15 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); // Send AI requests in parallel - // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI List> aiRequests = experimentsWithData.stream() .map(Summarizer::sendExperimentToOpenAI) .collect(Collectors.toList()); @@ -125,12 +139,12 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod .collect(Collectors.toList()); // Debug output - // System.out.println("Individual responses:"); + // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); + // System.exit(0); - JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); - return finalSummary; + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; } catch (WdkModelException e) { // Handle errors gracefully @@ -139,7 +153,12 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod } } + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment) { + return sendExperimentToOpenAI(experiment, CacheMode.POPULATE); + } + + private static CompletableFuture sendExperimentToOpenAI(JSONObject experiment, CacheMode cacheMode) { // Possible TO DO: AI EDIT DESCRIPTION // Before sending the experiment+data to the AI, ask the AI to edit the `description` field @@ -152,12 +171,15 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e - // We don't need to send the dataset_id to the AI but it's useful to have in the - // response for phase two - so we save it for later + // We don't need to send the gene_id or dataset_id to the AI but we need the gene ID + // for the cache key and it's useful to have dataset_id in the response for phase two + // - so we save them for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + String geneId = experimentForAI.has("gene_id") ? experimentForAI.getString("gene_id") : null; + experimentForAI.remove("gene_id"); String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - + String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. 
Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + @@ -166,6 +188,19 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e // System.out.println(message); /// DEBUG + String cacheKey = geneId + ':' + datasetId; + + if (cache.isCacheValid(cacheKey, message)) { + try { + return CompletableFuture.completedFuture(cache.readCachedData(cacheKey)); +// } catch (IOException e) { +// // maybe log that the cache was unexpectedly invalidated +// // and then continue to compute and populate cache entry + } catch (Exception e) { + // do nothing + } + } + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) .maxCompletionTokens(MAX_RESPONSE_TOKENS) From 813d2f148b2773fcda9af6bb5572cab7e4621086 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 18 Feb 2025 19:57:09 +0000 Subject: [PATCH 08/31] more cache wrangling --- .../ai/expression/GeneRecordProcessor.java | 2 - .../report/ai/expression/Summarizer.java | 102 ++++++++++++------ 2 files changed, 70 insertions(+), 34 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 4916fd76c..1320fd709 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -44,13 +44,11 @@ public static List processExpressionData(RecordInstance geneRecord, // return value: List experiments = new ArrayList<>(); - String geneId = geneRecord.getAttributeValue("gene_id").getValue(); TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); for (TableValueRow experimentRow : expressionGraphs) { JSONObject experimentInfo = new JSONObject(); - experimentInfo.put("gene_id", geneId); // Extract all relevant attributes for (String key : KEYS_TO_KEEP) { diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index b575677e5..ba3ee6754 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -20,10 +20,12 @@ import com.openai.models.ResponseFormatJsonSchema.JsonSchema; import com.openai.core.JsonValue; import java.util.List; +import java.util.ArrayList; import java.util.Map; import java.util.HashMap; import java.util.Set; import java.util.HashSet; +import java.util.Optional; import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; import java.io.IOException; @@ -124,28 +126,57 @@ public class Summarizer { public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { try { 
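      // A minimal usage sketch of the two cache modes handled below (the method name and
      // the cacheStatus/expressionSummary wrapper keys come from this patch; how a caller
      // chains the two calls is an assumption, not something the patch itself prescribes):
      //
      //   JSONObject result = Summarizer.summarizeExpression(geneRecord, CacheMode.TEST);
      //   if ("miss".equals(result.optString("cacheStatus"))) {
      //     // not everything was cached yet: generate via OpenAI and write the cache entries
      //     result = Summarizer.summarizeExpression(geneRecord, CacheMode.POPULATE);
      //   }
      //   JSONObject summary = result.getJSONObject("expressionSummary");
      //
      // In TEST mode every per-experiment entry (and the final summary) must already be
      // cached; any miss short-circuits with {"cacheStatus": "miss"}. In POPULATE mode
      // missing entries are generated and written back to the on-disk cache.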
+      String geneId = geneRecord.getAttributeValue("gene_id").getValue();
+
       // Process expression data further into a list of pruned metadata plus data
       List<JSONObject> experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord);
       System.out.println("Pre-processed Experiments: " + experimentsWithData.size());
+      // TEST Mode: Collect valid cache entries
+      if (cacheMode == CacheMode.TEST) {
+        List<JSONObject> cachedResponses = new ArrayList<>();
+
+        for (JSONObject experiment : experimentsWithData) {
+
+          Optional<JSONObject> experimentSummary = Summarizer.sendExperimentToOpenAI(geneId, experiment, CacheMode.TEST).join();
+
+          if (experimentSummary.isPresent()) {
+            cachedResponses.add(experimentSummary.get());
+          } else {
+            return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early
+          }
+        }
+
+        // All experiment-level caches are valid, now check final summary cache
+        Optional<JSONObject> finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, CacheMode.TEST);
+        return finalSummary
+          .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary))
+          .orElseGet(() -> new JSONObject().put("cacheStatus", "miss"));
+      }
+
+
       // Send AI requests in parallel
-      List<CompletableFuture<JSONObject>> aiRequests = experimentsWithData.stream()
-        .map(Summarizer::sendExperimentToOpenAI)
+      List<CompletableFuture<Optional<JSONObject>>> aiRequests = experimentsWithData.stream()
+        // TO DO - potentially some optimisation?
+        // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE)))
+        .map(exp -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE))
         .collect(Collectors.toList());
-      // Wait for all requests to complete
+      // Wait for all requests to complete with `join`
       List<JSONObject> responses = aiRequests.stream()
-        .map(CompletableFuture::join) // Blocks until each completes
-        .collect(Collectors.toList());
+        .map(CompletableFuture::join)  // Get Optional<JSONObject>
+        .filter(Optional::isPresent)   // Keep only non-empty results
+        .map(Optional::get)            // Extract JSONObject
+        .collect(Collectors.toList());
       // Debug output
       // System.out.println("Individual responses:");
       // responses.forEach(response -> System.out.println(response.toString(2)));
       // System.exit(0);
-      JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses);
-      return finalSummary;
-
+      Optional<JSONObject> finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, CacheMode.POPULATE);
+      return finalSummary
+        .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary))
+        .orElseGet(() -> new JSONObject().put("cacheStatus", "miss"));
     } catch (WdkModelException e) {
       // Handle errors gracefully
       System.err.println("Error fetching expression data: " + e.getMessage());
@@ -154,11 +185,7 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod
     }
-  private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject experiment) {
-    return sendExperimentToOpenAI(experiment, CacheMode.POPULATE);
-  }
-
-  private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject experiment, CacheMode cacheMode) {
+  private static CompletableFuture<Optional<JSONObject>> sendExperimentToOpenAI(String geneId, JSONObject experiment, CacheMode cacheMode) {
     // Possible TO DO: AI EDIT DESCRIPTION
     // Before sending the experiment+data to the AI, ask the AI to edit the `description` field
@@ -171,12 +198,9 @@ private static CompletableFuture<JSONObject> sendExperimentToOpenAI(JSONObject e
-    // We don't need to send the gene_id or dataset_id to the AI but we need the gene ID
-    // for the cache
key and it's useful to have dataset_id in the response for phase two - // - so we save them for later + // We don't need to send dataset_id to the AI but it's useful to have it + // in the response for phase two JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String geneId = experimentForAI.has("gene_id") ? experimentForAI.getString("gene_id") : null; - experimentForAI.remove("gene_id"); String datasetId = experimentForAI.has("dataset_id") ? experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); @@ -192,15 +216,20 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e if (cache.isCacheValid(cacheKey, message)) { try { - return CompletableFuture.completedFuture(cache.readCachedData(cacheKey)); -// } catch (IOException e) { -// // maybe log that the cache was unexpectedly invalidated -// // and then continue to compute and populate cache entry + JSONObject cachedResponse = cache.readCachedData(cacheKey); + return CompletableFuture.completedFuture(Optional.of(cachedResponse)); } catch (Exception e) { - // do nothing + System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); + + if (cacheMode == CacheMode.TEST) { + return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss + } + // Else, log and fall through to AI generation } + } else if (cacheMode == CacheMode.TEST) { + return CompletableFuture.completedFuture(Optional.empty()); } - + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(model) .maxCompletionTokens(MAX_RESPONSE_TOKENS) @@ -212,7 +241,6 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e .build()) .addSystemMessage(systemMessage) .addUserMessage(message) - // .temperature(1.0) .build(); // add dataset_id back to the response @@ -223,17 +251,27 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e try { JSONObject jsonObject = new JSONObject(jsonString); jsonObject.put("dataset_id", datasetId); - return jsonObject; + + // Cache the response + try { + cache.populateCache(cacheKey, message, jsonObject); + } catch (Exception e) { + System.err.println("Warning: Failed to cache response for gene " + geneId + + " and dataset " + datasetId + ": " + e.getMessage()); + } + + return Optional.of(jsonObject); } catch (JSONException e) { - System.err.println("Error parsing JSON response for dataset " + datasetId + ": " + e.getMessage()); + System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + ": " + e.getMessage()); System.err.println("Raw response: " + jsonString); - return new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); + return Optional.of(errorResponse); } }); } - private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { + private static Optional sendExperimentSummariesToOpenAI(String geneId, List experiments, CacheMode cacheMode) { String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + @@ -258,10 +296,10 @@ private static JSONObject sendExperimentSummariesToOpenAI(List exper String jsonString = completion.choices().get(0).message().content().get(); JSONObject rawResponseObject = new JSONObject(jsonString); - // TO DO - quality 
control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return finalResponseObject; + return Optional.of(finalResponseObject); } From 5c323c369c1194e16a95cad5e35e1068552bdba1 Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Tue, 18 Feb 2025 16:17:48 -0500 Subject: [PATCH 09/31] a few changes for error handling and to set up caching --- .../apicommon/model/report/ai/CacheMode.java | 24 +-- .../ai/SingleGeneAiExpressionReporter.java | 83 ++++++---- .../report/ai/expression/ExpressionData.java | 25 +-- .../ai/expression/GeneRecordProcessor.java | 150 +++++++++++------- .../report/ai/expression/Summarizer.java | 71 ++++----- 5 files changed, 192 insertions(+), 161 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java index d514ca813..110c95967 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java @@ -1,26 +1,6 @@ package org.apidb.apicommon.model.report.ai; public enum CacheMode { - - TEST("test"), - POPULATE("populate"); - - private final String mode; - - CacheMode(String mode) { - this.mode = mode; - } - - public String getMode() { - return mode; - } - - public static CacheMode fromString(String mode) throws IllegalArgumentException { - for (CacheMode cm : CacheMode.values()) { - if (cm.mode.equalsIgnoreCase(mode)) { - return cm; - } - } - throw new IllegalArgumentException("Invalid CacheMode: " + mode); - } + TEST, + POPULATE; } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 9b8a0f336..29cd28036 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -1,65 +1,78 @@ package org.apidb.apicommon.model.report.ai; -import org.gusdb.wdk.model.report.AbstractReporter; -import org.gusdb.wdk.model.report.Reporter; -import org.gusdb.wdk.model.report.ReporterConfigException; +import java.io.BufferedWriter; +import java.io.IOException; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.util.List; +import java.util.Map; +import java.util.stream.Collectors; + +import org.apidb.apicommon.model.TranscriptUtil; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.record.RecordClass; -import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; -import org.apidb.apicommon.model.TranscriptUtil; +import org.gusdb.wdk.model.record.RecordClass; +import org.gusdb.wdk.model.record.RecordInstance; import org.gusdb.wdk.model.record.TableField; -import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; - +import org.gusdb.wdk.model.report.AbstractReporter; +import 
org.gusdb.wdk.model.report.Reporter; +import org.gusdb.wdk.model.report.ReporterConfigException; +import org.json.JSONException; import org.json.JSONObject; -import java.io.IOException; -import java.io.OutputStream; -import java.util.Map; -import java.util.List; -import java.util.stream.Collectors; -public class SingleGeneAiExpressionReporter extends AbstractReporter { +public class SingleGeneAiExpressionReporter extends AbstractReporter { + + private static final int MAX_RESULT_SIZE = 1; // one gene at a time for now private CacheMode _cacheMode = CacheMode.TEST; - + @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { + // assign cache mode if (config.has("cacheMode")) { - _cacheMode = CacheMode.fromString(config.getString("cacheMode")); + _cacheMode = CacheMode.valueOf(config.getString("cacheMode").toUpperCase()); + } + + // check model config; this should only be assigned to genes + RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); + if (_baseAnswer.getQuestion().getRecordClass() != geneRecordClass) { + throw new WdkModelException(SingleGeneAiExpressionReporter.class.getName() + + " should only be assigned to " + geneRecordClass.getFullName()); + } + + // check result size; limit to small results due to OpenAI cost + if (_baseAnswer.getResultSizeFactory().getResultSize() > MAX_RESULT_SIZE) { + throw new ReporterConfigException("This reporter cannot be called with results of size greater than " + MAX_RESULT_SIZE); } - } catch (IllegalArgumentException e) { - throw new ReporterConfigException("Invalid cacheMode value: " + config.getString("cacheMode"), e); + } + catch (JSONException | IllegalArgumentException e) { + throw new ReporterConfigException("Invalid cacheMode value: " + config.get("cacheMode"), e); } return this; } @Override protected void write(OutputStream out) throws IOException, WdkModelException { - RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); - Map tableFields = geneRecordClass.getTableFieldMap(); - List tables = List.of("ExpressionGraphs", "ExpressionGraphsDataTable").stream() - .map(name -> tableFields.get(name)) - .collect(Collectors.toList()); - try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables)) { - RecordInstance singleRecord = recordStream.iterator().next(); - // we will need to pass `_cacheMode` to `summarizeExpression()`... 
- JSONObject expressionSummary = Summarizer.summarizeExpression(singleRecord, _cacheMode); - out.write(expressionSummary.toString().getBytes()); - out.flush(); + Map tableFields = _baseAnswer.getQuestion().getRecordClass().getTableFieldMap(); + List tables = GeneRecordProcessor.REQUIRED_TABLE_NAMES.stream() + .map(name -> tableFields.get(name)).collect(Collectors.toList()); + + try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables); + BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out))) { + for (RecordInstance record : recordStream) { + JSONObject expressionSummary = Summarizer.summarizeExpression(record, _cacheMode); + writer.write(expressionSummary.toString()); + } } catch (WdkUserException e) { throw new WdkModelException(e); } - - } - } - - diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java index c2e688878..9d807d770 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java @@ -4,19 +4,20 @@ import java.util.List; public class ExpressionData { - private final List expressionGraphs; - private final List expressionGraphsDataTable; - public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { - this.expressionGraphs = expressionGraphs; - this.expressionGraphsDataTable = expressionGraphsDataTable; - } + private final List expressionGraphs; + private final List expressionGraphsDataTable; - public List getExpressionGraphs() { - return expressionGraphs; - } + public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { + this.expressionGraphs = expressionGraphs; + this.expressionGraphsDataTable = expressionGraphsDataTable; + } - public List getExpressionGraphsDataTable() { - return expressionGraphsDataTable; - } + public List getExpressionGraphs() { + return expressionGraphs; + } + + public List getExpressionGraphsDataTable() { + return expressionGraphsDataTable; + } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 624216b72..09b30685a 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,84 +1,122 @@ package org.apidb.apicommon.model.report.ai.expression; -import org.gusdb.wdk.model.record.RecordInstance; -import org.gusdb.wdk.model.record.TableValue; -import org.gusdb.wdk.model.record.TableValueRow; -import org.gusdb.wdk.model.WdkUserException; -import org.gusdb.wdk.model.WdkModelException; - -import org.json.JSONArray; -import org.json.JSONObject; import java.util.ArrayList; import java.util.List; import java.util.Set; import java.util.stream.Collectors; -import java.util.stream.StreamSupport; + +import org.gusdb.fgputil.EncryptionUtil; +import org.gusdb.fgputil.json.JsonUtil; +import org.gusdb.wdk.model.WdkModelException; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.record.RecordInstance; +import org.gusdb.wdk.model.record.TableValue; +import org.gusdb.wdk.model.record.TableValueRow; +import org.json.JSONObject; /** * expects a geneRecord with two tables: "ExpressionGraphs" and 
"ExpressionGraphsDataTable" * * returns a list of JSON Objects of data ready to feed the AI */ - public class GeneRecordProcessor { - private static final Set KEYS_TO_KEEP = - Set.of( - "y_axis", "description", "genus_species", "project_id", "summary", "dataset_id", - "assay_type", "x_axis", "module", "dataset_name", "display_name", "short_attribution", "paralog_number" - ); - public static List processExpressionData(RecordInstance geneRecord) throws WdkModelException, WdkUserException { + private static final Set KEYS_TO_KEEP = Set.of("y_axis", "description", "genus_species", + "project_id", "summary", "dataset_id", "assay_type", "x_axis", "module", "dataset_name", "display_name", + "short_attribution", "paralog_number"); + + private static final String EXPRESSION_GRAPH_TABLE = "ExpressionGraphs"; + private static final String EXPRESSION_GRAPH_DATA_TABLE = "ExpressionGraphsDataTable"; + + public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + + public interface GeneSummaryInputs { + + String getGeneId(); + + List getExperimentsWithData(); + + default String getExperimentsDigest() { + return EncryptionUtil.md5(getExperimentsWithData().stream() + .map(JsonUtil::serialize).collect(Collectors.joining())); + } + } + + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record) throws WdkModelException { + String geneId = record.getPrimaryKey().getValues().get("gene_source_id"); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record); + return new GeneSummaryInputs() { + @Override + public String getGeneId() { + return geneId; + } + @Override + public List getExperimentsWithData() { + return experimentsWithData; + } + }; + } + + static List processExpressionData(RecordInstance geneRecord) + throws WdkModelException { return processExpressionData(geneRecord, 0); } // for debugging only - public static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException, WdkUserException { + static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException { List experiments = processExpressionData(geneRecord, 0); - return experiments.stream() - .filter(experiment -> datasetId.equals(experiment.getString("dataset_id"))) - .collect(Collectors.toList()); + return experiments.stream().filter( + experiment -> datasetId.equals(experiment.getString("dataset_id"))).collect(Collectors.toList()); } // maxExperiments is for dev/debugging only - public static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException, WdkUserException { - // return value: - List experiments = new ArrayList<>(); - - TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); - TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); - - for (TableValueRow experimentRow : expressionGraphs) { - JSONObject experimentInfo = new JSONObject(); - - // Extract all relevant attributes - for (String key : KEYS_TO_KEEP) { - experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); - } - - List filteredData = new ArrayList<>(); - String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); - // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` - // (this would be more efficient with a `Map>` made before the `expressionGraphs` loop) - List thisExperimentDataRows = new ArrayList<>(); - for (TableValueRow dataRow 
: expressionGraphsDataTable) { - if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { - JSONObject dataEntry = new JSONObject(); - - // Extract relevant numeric fields - List dataKeys = List.of("value", "standard_error", "percentile_channel1", "percentile_channel2", "sample_name"); - for (String key : dataKeys) { - dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + static List processExpressionData(RecordInstance geneRecord, int maxExperiments) + throws WdkModelException { + try { + // return value: + List experiments = new ArrayList<>(); + + TableValue expressionGraphs = geneRecord.getTableValue("ExpressionGraphs"); + TableValue expressionGraphsDataTable = geneRecord.getTableValue("ExpressionGraphsDataTable"); + + for (TableValueRow experimentRow : expressionGraphs) { + JSONObject experimentInfo = new JSONObject(); + + // Extract all relevant attributes + for (String key : KEYS_TO_KEEP) { + experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); + } + + List filteredData = new ArrayList<>(); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + // add data from `expressionGraphsDataTable` where attribute "dataset_id" equals `datasetId` + // (this would be more efficient with a `Map>` made before the + // `expressionGraphs` loop) + for (TableValueRow dataRow : expressionGraphsDataTable) { + if (dataRow.getAttributeValue("dataset_id").getValue().equals(datasetId)) { + JSONObject dataEntry = new JSONObject(); + + // Extract relevant numeric fields + List dataKeys = List.of("value", "standard_error", "percentile_channel1", + "percentile_channel2", "sample_name"); + for (String key : dataKeys) { + dataEntry.put(key, dataRow.getAttributeValue(key).getValue()); + } + + filteredData.add(dataEntry); } - - filteredData.add(dataEntry); } + + experimentInfo.put("data", filteredData); + experiments.add(experimentInfo); + + if (maxExperiments > 0 && experiments.size() >= maxExperiments) + break; } - - experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); - - if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; + return experiments; + } + catch (WdkUserException e) { + throw new WdkModelException(e.getMessage()); } - return experiments; } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 009be461a..4b78d6c58 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,32 +1,32 @@ package org.apidb.apicommon.model.report.ai.expression; -import org.apidb.apicommon.model.report.ai.CacheMode; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.stream.Collectors; -import org.gusdb.wdk.model.record.RecordInstance; -import org.gusdb.wdk.model.WdkUserException; +import org.apidb.apicommon.model.report.ai.CacheMode; import org.gusdb.wdk.model.WdkModelException; - -import org.json.JSONObject; +import org.gusdb.wdk.model.WdkUserException; +import org.gusdb.wdk.model.record.RecordInstance; import org.json.JSONArray; import org.json.JSONException; +import org.json.JSONObject; import com.openai.client.OpenAIClientAsync; import com.openai.client.okhttp.OpenAIOkHttpClientAsync; +import 
com.openai.core.JsonValue; +import com.openai.models.ChatCompletion; import com.openai.models.ChatCompletionCreateParams; import com.openai.models.ChatModel; -import com.openai.models.ChatCompletion; import com.openai.models.ResponseFormatJsonSchema; import com.openai.models.ResponseFormatJsonSchema.JsonSchema; -import com.openai.core.JsonValue; -import java.util.List; -import java.util.Map; -import java.util.HashMap; -import java.util.Set; -import java.util.HashSet; -import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; public class Summarizer { + private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() .fromEnv() // Uses OPENAI_API_KEY from env .maxRetries(32) // Handle 429 errors @@ -106,33 +106,34 @@ public class Summarizer { ) ) .build(); - - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { - + + public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + try { // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - + // Send AI requests in parallel - // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI + // CACHE OPPORTUNITY ONE - sendExperimentToOpenAI List> aiRequests = experimentsWithData.stream() - .map(Summarizer::sendExperimentToOpenAI) - .collect(Collectors.toList()); + .map(Summarizer::sendExperimentToOpenAI) + .collect(Collectors.toList()); // Wait for all requests to complete List responses = aiRequests.stream() - .map(CompletableFuture::join) // Blocks until each completes - .collect(Collectors.toList()); + .map(CompletableFuture::join) // Blocks until each completes + .collect(Collectors.toList()); // Debug output - // System.out.println("Individual responses:"); + // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); - - JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); - return finalSummary; + // System.exit(0); - } catch (WdkModelException e) { + JSONObject finalSummary = sendExperimentSummariesToOpenAI(responses); + return finalSummary; + + } + catch (WdkModelException e) { // Handle errors gracefully System.err.println("Error fetching expression data: " + e.getMessage()); throw new WdkUserException(e); @@ -150,16 +151,14 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e // // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. - - // We don't need to send the dataset_id to the AI but it's useful to have in the // response for phase two - so we save it for later JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone String datasetId = experimentForAI.has("dataset_id") ? 
experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - String message = "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + - "```json\n%s\n```\n\n".formatted(experimentForAI.toString()) + + String message = String.format("The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + "```json\n%s\n```\n\n", experimentForAI.toString()) + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. 
Do not include any duplicate fields."; @@ -200,8 +199,8 @@ private static CompletableFuture sendExperimentToOpenAI(JSONObject e private static JSONObject sendExperimentSummariesToOpenAI(List experiments) { - String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n".formatted(new JSONArray(experiments)) + + String message = String.format("Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() From 0e6b9451980d7e5b46759c035d63be4714ae67ba Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Thu, 20 Feb 2025 11:14:26 -0500 Subject: [PATCH 10/31] Massage class roles --- .../apicommon/model/report/ai/CacheMode.java | 6 - .../ai/SingleGeneAiExpressionReporter.java | 52 ++- .../ai/expression/AiExpressionCache.java | 112 ++++--- .../report/ai/expression/ExpressionData.java | 23 -- .../ai/expression/GeneRecordProcessor.java | 85 +++-- .../report/ai/expression/Summarizer.java | 297 +++++++++--------- 6 files changed, 325 insertions(+), 250 deletions(-) delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java delete mode 100644 Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java deleted file mode 100644 index 110c95967..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/CacheMode.java +++ /dev/null @@ -1,6 +0,0 @@ -package org.apidb.apicommon.model.report.ai; - -public enum CacheMode { - TEST, - POPULATE; -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 29cd28036..ee0eb86eb 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -9,8 +9,10 @@ import java.util.stream.Collectors; import org.apidb.apicommon.model.TranscriptUtil; +import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; import org.apidb.apicommon.model.report.ai.expression.Summarizer; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; @@ -28,15 +30,15 @@ public class SingleGeneAiExpressionReporter extends AbstractReporter { private 
static final int MAX_RESULT_SIZE = 1; // one gene at a time for now - private CacheMode _cacheMode = CacheMode.TEST; + private static final String POPULATION_MODE_PROP_KEY = "populateIfNotPresent"; + + private boolean _populateIfNotPresent; @Override public Reporter configure(JSONObject config) throws ReporterConfigException, WdkModelException { try { // assign cache mode - if (config.has("cacheMode")) { - _cacheMode = CacheMode.valueOf(config.getString("cacheMode").toUpperCase()); - } + _populateIfNotPresent = config.optBoolean(POPULATION_MODE_PROP_KEY, false); // check model config; this should only be assigned to genes RecordClass geneRecordClass = TranscriptUtil.getGeneRecordClass(_wdkModel); @@ -59,19 +61,53 @@ public Reporter configure(JSONObject config) throws ReporterConfigException, Wdk @Override protected void write(OutputStream out) throws IOException, WdkModelException { + // get table fields needed to produce summary inputs Map tableFields = _baseAnswer.getQuestion().getRecordClass().getTableFieldMap(); List tables = GeneRecordProcessor.REQUIRED_TABLE_NAMES.stream() .map(name -> tableFields.get(name)).collect(Collectors.toList()); + // open summary cache (manages persistence of expression data) + AiExpressionCache cache = AiExpressionCache.getInstance(_wdkModel); + + // create summarizer (interacts with OpenAI) + Summarizer summarizer = new Summarizer(_wdkModel); + + // open record and output streams try (RecordStream recordStream = RecordStreamFactory.getRecordStream(_baseAnswer, List.of(), tables); BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(out))) { + + // write a JSON object with gene ID keys and expression summary values + writer.write("{"); + boolean firstRecord = true; for (RecordInstance record : recordStream) { - JSONObject expressionSummary = Summarizer.summarizeExpression(record, _cacheMode); - writer.write(expressionSummary.toString()); + + // create summary inputs + GeneSummaryInputs summaryInputs = GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage); + + // fetch summary, producing if necessary and requested + JSONObject expressionSummary = _populateIfNotPresent + ? 
getSummary(summaryInputs, summarizer, cache) + : readSummary(summaryInputs, cache); + + // join entries with commas + if (firstRecord) firstRecord = false; else writer.write(","); + + // write JSON object + writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); + } } - catch (WdkUserException e) { - throw new WdkModelException(e); + } + + private JSONObject getSummary(GeneSummaryInputs summaryInputs, Summarizer summarizer, AiExpressionCache cache) { + try { + + } + } + + private JSONObject readSummary(GeneSummaryInputs summaryInputs, AiExpressionCache cache) { + try { + } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 2a9291dc4..1b48ea698 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -1,75 +1,109 @@ package org.apidb.apicommon.model.report.ai.expression; import java.io.IOException; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; -import java.nio.file.Files; -import java.security.MessageDigest; -import java.security.NoSuchAlgorithmException; -import java.util.function.Function; -import org.json.JSONObject; -import org.json.JSONException; +import java.util.Optional; +import java.util.Set; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; +import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; +import org.gusdb.fgputil.cache.disk.OnDiskCache.Overwrite; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; +import org.gusdb.wdk.model.WdkModel; +import org.json.JSONException; +import org.json.JSONObject; -public class AiExpressionCache extends OnDiskCache { +public class AiExpressionCache { + + // constants to determine cache location + private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; + private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; - // Default cache location and timing settings - private static final Path DEFAULT_CACHE_DIR = Paths.get("/tmp/expressionCache"); private static final long DEFAULT_TIMEOUT_MILLIS = 5000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; - // No-argument constructor using defaults - public AiExpressionCache() throws IOException { - super(DEFAULT_CACHE_DIR, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + // singleton pattern + private static AiExpressionCache _instance; + + public static synchronized AiExpressionCache getInstance(WdkModel wdkModel) throws IOException { + if (_instance == null) { + _instance = new AiExpressionCache(wdkModel); + } + else if (_instance._wdkModel != wdkModel) { + // callers should always use the same model + throw new IllegalStateException("Attempt to get instance with different model than previously used."); + } + return _instance; } - // Check if cached data is valid - public boolean isCacheValid(String cacheKey, String inputData) { + private final WdkModel _wdkModel; + private final OnDiskCache _cache; + public AiExpressionCache(WdkModel wdkModel) throws IOException { + _wdkModel = wdkModel; + + Path cacheParentDir = Optional + 
.ofNullable(_wdkModel.getProperties().get(CACHE_DIR_PROP_NAME)) + .map(Paths::get) + .orElse(Paths.get(_wdkModel.getModelConfig().getWdkTempDir().toString(), DEFAULT_TMP_CACHE_SUBDIR)); + + _cache = new OnDiskCache(cacheParentDir, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + + } + + public void blah() { + _cache.populateAndProcessContent(geneId, populator, visitor, overwritePredicate) + } + + // Check if cached data is valid + public boolean isCacheValid(GeneSummaryInputs summaryInputs) { try { FunctionWithException visitor = entryDir -> { - Path digestFile = entryDir.resolve("digest.txt"); - - if (!Files.exists(digestFile)) { - System.out.println("No digest file found."); - return false; - } - - // Read stored digest and compare - String cachedDigest = Files.readString(digestFile); - String computedDigest = computeDigest(inputData); - - if (cachedDigest.equals(computedDigest)) { - System.out.println("Cache digest matches input."); - return true; - } else { - System.out.println("Cache digest mismatch! Cache is out of date."); - return false; - } + Path digestFile = entryDir.resolve("digest.txt"); + + if (!Files.exists(digestFile)) { + System.out.println("No digest file found."); + return false; + } + + // Read stored digest and compare + String cachedDigest = Files.readString(digestFile); + + if (cachedDigest.equals(summaryInputs.getExperimentsDigest())) { + System.out.println("Cache digest matches input."); + return true; + } + else { + System.out.println("Cache digest mismatch! Cache is out of date."); + return false; + } }; - return visitContent(cacheKey, visitor); + return _cache.visitContent(summaryInputs.getGeneId(), visitor); - } catch (EntryNotCreatedException e) { + } + catch (EntryNotCreatedException e) { System.out.println("Cache entry does not exist yet."); return false; - } catch (Exception e) { + } + catch (Exception e) { throw new RuntimeException("Error validating cache entry", e); } } // Populate cache with computed data (Method 1: Takes computedData directly) - public void populateCache(String cacheKey, String inputData, JSONObject computedData) throws Exception { + public void populateCache(GeneSummaryInputs summaryInputs, JSONObject computedData) throws Exception { ConsumerWithException populator = entryDir -> { Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString()); - Files.writeString(entryDir.resolve("digest.txt"), computeDigest(inputData)); + Files.writeString(entryDir.resolve("digest.txt"), summaryInputs.getExperimentsDigest()); }; // Populate with overwrite policy (assumes caller ensures it's necessary) - populateAndProcessContent(cacheKey, populator, path -> null, Overwrite.YES); + _cache.populateAndProcessContent(summaryInputs.getGeneId(), populator, path -> null, Overwrite.YES); } // // Populate cache with a function that computes the result (Method 2: Uses a function) @@ -94,7 +128,7 @@ public JSONObject readCachedData(String cacheKey) throws Exception { } }; - return visitContent(cacheKey, visitor); + return _cache.visitContent(cacheKey, visitor); } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java deleted file mode 100644 index 9d807d770..000000000 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/ExpressionData.java +++ /dev/null @@ -1,23 +0,0 @@ -package org.apidb.apicommon.model.report.ai.expression; - -import org.json.JSONObject; -import java.util.List; - 
-public class ExpressionData { - - private final List expressionGraphs; - private final List expressionGraphsDataTable; - - public ExpressionData(List expressionGraphs, List expressionGraphsDataTable) { - this.expressionGraphs = expressionGraphs; - this.expressionGraphsDataTable = expressionGraphsDataTable; - } - - public List getExpressionGraphs() { - return expressionGraphs; - } - - public List getExpressionGraphsDataTable() { - return expressionGraphsDataTable; - } -} diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index fce627768..807cec0f3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,8 +1,11 @@ package org.apidb.apicommon.model.report.ai.expression; import java.util.ArrayList; +import java.util.LinkedHashMap; import java.util.List; +import java.util.Map; import java.util.Set; +import java.util.function.Function; import java.util.stream.Collectors; import org.gusdb.fgputil.EncryptionUtil; @@ -30,53 +33,61 @@ public class GeneRecordProcessor { public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + public interface ExperimentInputs { + + String getCacheKey(); + + String getDigest(); + + JSONObject getExperimentData(); + } + public interface GeneSummaryInputs { - String getGeneId(); + String getGeneId(); // is the cache key - List getExperimentsWithData(); + Map getExperimentsWithData(); default String getExperimentsDigest() { - return EncryptionUtil.md5(getExperimentsWithData().stream() - .map(JsonUtil::serialize).collect(Collectors.joining())); + // TODO Does it make more sense to md5 the concatenation of the experiment hashes? 
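As a side-by-side for the TODO above, a minimal self-contained sketch of the two digest strategies being weighed. It uses java.security.MessageDigest directly rather than the project's EncryptionUtil, and the class and method names are invented for illustration. Either strategy changes whenever any experiment's serialized content changes; the second could be assembled from the per-experiment digests that are already stored with the experiment-level cache entries.

import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.List;
import java.util.stream.Collectors;

// Illustrative only: compares the digest currently built in getDigest() with the
// alternative raised in the TODO (hashing the concatenation of per-experiment hashes).
public class DigestStrategySketch {

  // strategy used in this patch: md5 over the concatenation of serialized experiment JSON
  static String digestOfSerializedExperiments(List<String> serializedExperiments) {
    return md5(String.join("", serializedExperiments));
  }

  // alternative from the TODO: md5 over the concatenation of per-experiment md5 digests,
  // which could reuse the digests already stored with each experiment-level cache entry
  static String digestOfExperimentDigests(List<String> serializedExperiments) {
    return md5(serializedExperiments.stream()
        .map(DigestStrategySketch::md5)
        .collect(Collectors.joining()));
  }

  static String md5(String input) {
    try {
      byte[] hash = MessageDigest.getInstance("MD5")
          .digest(input.getBytes(StandardCharsets.UTF_8));
      StringBuilder hex = new StringBuilder();
      for (byte b : hash) {
        hex.append(String.format("%02x", b));
      }
      return hex.toString();
    }
    catch (NoSuchAlgorithmException e) {
      throw new IllegalStateException("MD5 not available", e); // never thrown on a standard JRE
    }
  }
}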
+ return EncryptionUtil.md5(getExperimentsWithData().values().stream() + .map(ExperimentInputs::getExperimentData) + .map(JsonUtil::serialize) + .collect(Collectors.joining())); } } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record) throws WdkModelException { - String geneId = record.getPrimaryKey().getValues().get("gene_source_id"); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record); + private static String getGeneId(RecordInstance record) { + return record.getPrimaryKey().getValues().get("gene_source_id"); + } + + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function experimentDigester) throws WdkModelException { + + String geneId = getGeneId(record); + + Map experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + return new GeneSummaryInputs() { @Override public String getGeneId() { return geneId; } + @Override - public List getExperimentsWithData() { + public Map getExperimentsWithData() { return experimentsWithData; } }; } - static List processExpressionData(RecordInstance geneRecord) - throws WdkModelException { - return processExpressionData(geneRecord, 0); - } - - // for debugging only - static List processExpressionData(RecordInstance geneRecord, String datasetId) throws WdkModelException { - List experiments = processExpressionData(geneRecord, 0); - return experiments.stream().filter( - experiment -> datasetId.equals(experiment.getString("dataset_id"))).collect(Collectors.toList()); - } - - // maxExperiments is for dev/debugging only - static List processExpressionData(RecordInstance geneRecord, int maxExperiments) throws WdkModelException { + private static Map processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: - List experiments = new ArrayList<>(); + Map experiments = new LinkedHashMap<>(); - TableValue expressionGraphs = geneRecord.getTableValue(EXPRESSION_GRAPH_TABLE); - TableValue expressionGraphsDataTable = geneRecord.getTableValue(EXPRESSION_GRAPH_DATA_TABLE); + String geneId = getGeneId(record); + TableValue expressionGraphs = record.getTableValue(EXPRESSION_GRAPH_TABLE); + TableValue expressionGraphsDataTable = record.getTableValue(EXPRESSION_GRAPH_DATA_TABLE); for (TableValueRow experimentRow : expressionGraphs) { @@ -87,11 +98,28 @@ static List processExpressionData(RecordInstance geneRecord, int max experimentInfo.put(key, experimentRow.getAttributeValue(key).getValue()); } - List filteredData = readFilteredData( - experimentRow.getAttributeValue("dataset_id").getValue(), expressionGraphsDataTable); + String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + + List filteredData = readFilteredData(datasetId, expressionGraphsDataTable); experimentInfo.put("data", filteredData); - experiments.add(experimentInfo); + + experiments.put(datasetId, new ExperimentInputs() { + @Override + public String getCacheKey() { + return geneId + ':' + datasetId; + } + + @Override + public String getDigest() { + return EncryptionUtil.md5(getExperimentPrompt.apply(getExperimentData())); + } + + @Override + public JSONObject getExperimentData() { + return experimentInfo; + } + }); if (maxExperiments > 0 && experiments.size() >= maxExperiments) break; @@ -122,4 +150,5 @@ private static List readFilteredData(String datasetId, TableValue ex } return filteredData; } + } diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 26b1c3293..865d78457 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -11,7 +11,10 @@ import java.util.concurrent.CompletableFuture; import java.util.stream.Collectors; -import org.apidb.apicommon.model.report.ai.CacheMode; +import org.apache.log4j.Logger; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.gusdb.fgputil.json.JsonUtil; +import org.gusdb.wdk.model.WdkModel; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.record.RecordInstance; @@ -30,26 +33,12 @@ public class Summarizer { - private static final OpenAIClientAsync openAIClient = OpenAIOkHttpClientAsync.builder() - .fromEnv() // Uses OPENAI_API_KEY from env - .maxRetries(32) // Handle 429 errors - .build(); - - private static final AiExpressionCache cache; + private static final Logger LOG = Logger.getLogger(Summarizer.class); - static { - AiExpressionCache tempCache = null; - try { - tempCache = new AiExpressionCache(); - } catch (IOException e) { - throw new RuntimeException("Failed to initialize AiExpressionCache", e); - } - cache = tempCache; - } - // provide exact model number for semi-reproducibility - private static final ChatModel model = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; - private static int MAX_RESPONSE_TOKENS = 10000; + public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + + private static final int MAX_RESPONSE_TOKENS = 10000; private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. 
You are an expert at providing biologist-friendly summaries of transcriptomic data"; @@ -122,63 +111,74 @@ public class Summarizer { ) .build(); - public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMode cacheMode) throws WdkUserException { + private static final String OPENAI_API_KEY_PROP_NAME = "OPENAI_API_KEY"; + + private final OpenAIClientAsync _openAIClient; + + public Summarizer(WdkModel wdkModel) { + _openAIClient = OpenAIOkHttpClientAsync.builder() + .apiKey(wdkModel.getProperties().get(OPENAI_API_KEY_PROP_NAME)) + .maxRetries(32) // Handle 429 errors + .build(); + } + + public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean populateIfNotPresent) + throws WdkUserException { try { - String geneId = geneRecord.getAttributeValue("gene_id").getValue(); // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = GeneRecordProcessor.processExpressionData(geneRecord); + List experimentsWithData = summaryInputs.getExperimentsWithData(); + String geneId = summaryInputs.getGeneId(); System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - // TEST Mode: Collect valid cache entries - if (cacheMode == CacheMode.TEST) { - List cachedResponses = new ArrayList<>(); - - for (JSONObject experiment : experimentsWithData) { - - Optional experimentSummary = Summarizer.sendExperimentToOpenAI(geneId, experiment, CacheMode.TEST).join(); - - if (experimentSummary.isPresent()) { + if (!populateIfNotPresent) { + List cachedResponses = new ArrayList<>(); + + for (JSONObject experiment : experimentsWithData) { + + Optional experimentSummary = sendExperimentToOpenAI(geneId, experiment, populateIfNotPresent).join(); + + if (experimentSummary.isPresent()) { cachedResponses.add(experimentSummary.get()); - } else { + } + else { return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early - } - } - - // All experiment-level caches are valid, now check final summary cache - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, CacheMode.TEST); - return finalSummary - .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); - } + } + } + // All experiment-level caches are valid, now check final summary cache + Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); + + return finalSummary.map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", + summary)).orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); + } // Send AI requests in parallel List>> aiRequests = experimentsWithData.stream() - // TO DO - potentially some optimisation? - // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE))) - .map(exp -> sendExperimentToOpenAI(geneId, exp, CacheMode.POPULATE)) - .collect(Collectors.toList()); + // TO DO - potentially some optimisation? 
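One possible shape for the optimisation pondered in the TO DO above (the commented-out supplyAsync call that follows is a variant of the same idea): push each per-experiment task, including its cache check, onto a bounded executor and gather the results with allOf() rather than joining futures one by one. Names and the executor size are illustrative assumptions, not part of the patch.

import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.stream.Collectors;

public class FanOutSketch {

  static <T> List<T> gather(List<CompletableFuture<T>> futures) {
    // allOf completes when every request has finished, so the join() calls below return immediately
    CompletableFuture.allOf(futures.toArray(new CompletableFuture[0])).join();
    return futures.stream().map(CompletableFuture::join).collect(Collectors.toList());
  }

  public static void main(String[] args) {
    ExecutorService pool = Executors.newFixedThreadPool(4); // bounded fan-out
    List<CompletableFuture<String>> futures = List.of("exp1", "exp2", "exp3").stream()
        .map(id -> CompletableFuture.supplyAsync(() -> "summary of " + id, pool))
        .collect(Collectors.toList());
    System.out.println(gather(futures));
    pool.shutdown();
  }
}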
+ // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, + // CacheMode.POPULATE))) + .map(exp -> sendExperimentToOpenAI(geneId, exp, populateIfNotPresent)).collect(Collectors.toList()); // Wait for all requests to complete with `join` - List responses = aiRequests.stream() - .map(CompletableFuture::join) // Get Optional - .filter(Optional::isPresent) // Keep only non-empty results - .map(Optional::get) // Extract JSONObject - .collect(Collectors.toList()); + List responses = aiRequests.stream().map(CompletableFuture::join) // Get + // Optional + .filter(Optional::isPresent) // Keep only non-empty results + .map(Optional::get) // Extract JSONObject + .collect(Collectors.toList()); // Debug output // System.out.println("Individual responses:"); // responses.forEach(response -> System.out.println(response.toString(2))); // System.exit(0); - - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, CacheMode.POPULATE); - return finalSummary - .map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); - } catch (WdkModelException e) { + Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); + return finalSummary.map( + summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)).orElseGet( + () -> new JSONObject().put("cacheStatus", "miss")); + } + catch (WdkModelException e) { // Handle errors gracefully System.err.println("Error fetching expression data: " + e.getMessage()); @@ -186,125 +186,127 @@ public static JSONObject summarizeExpression(RecordInstance geneRecord, CacheMod } } - - private static CompletableFuture> sendExperimentToOpenAI(String geneId, JSONObject experiment, CacheMode cacheMode) { + public static String getExperimentMessage(JSONObject experiment) { // Possible TO DO: AI EDIT DESCRIPTION // Before sending the experiment+data to the AI, ask the AI to edit the `description` field // as follows: (This should be cached by dataset_id only and would be called once per organism // and would reduce tokens and "cognitive load" a little bit for the next step.) // - // "Edit the following text to so that it **only** describes the experimental design of the transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, especially not any genes or groups of genes and their expression behaviour." + // "Edit the following text to so that it **only** describes the experimental design of the + // transcriptomics part of the study. Do not mention the results of any bioinformatics analyses performed, + // especially not any genes or groups of genes and their expression behaviour." // - // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt below. + // We would then be able to remove the "Ignore all discussion of individual or groups of genes in the + // experiment `description`, as this is irrelevant to the gene you are summarising." from the prompt + // below. - // We don't need to send dataset_id to the AI but it's useful to have it // in the response for phase two JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone - String datasetId = experimentForAI.has("dataset_id") ? 
experimentForAI.getString("dataset_id") : null; experimentForAI.remove("dataset_id"); - - String message = String.format("The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + - "```json\n%s\n```\n\n", experimentForAI.toString()) + - "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + - "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + - "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; - // System.out.println(message); /// DEBUG + return + "The JSON below contains expression data for a single gene within a specific experiment, along with relevant experimental and bioinformatics metadata:\n\n" + + String.format("```json\n%s\n```\n\n", JsonUtil.serialize(experimentForAI)) + + "**Task**: In one sentence, summarize how this gene is expressed in the given experiment. Do not describe the experiment itself—focus on whether the gene is, or is not, substantially and/or significantly upregulated or downregulated with respect to the experimental conditions tested. Take extreme care to assert the correct directionality of the response, especially in experiments with only one or two samples. Additionally, estimate the biological importance of this profile relative to other experiments on an integer scale of 0 (lowest, no differential expression) to 5 (highest, marked differential expression), even though specific comparative data has not been included. 
Also estimate your confidence (also 0 to 5) in making the estimate and add optional notes if there are peculiarities or caveats that may aid interpretation and further analysis. Finally, provide some general experiment-based keywords that provide a bit more context to the gene-based expression summary.\n" + + "**Purpose**: The one-sentence summary will be displayed to users in tabular form on our gene-page. Please wrap user-facing species names in `` tags and use clear, scientific language accessible to non-native English speakers. The notes, scores and keywords will not be shown to users, but will be passed along with the summary to a second AI summarisation step that synthesizes insights from multiple experiments.\n" + + "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; + } + + private CompletableFuture> sendExperimentToOpenAI(String geneId, + JSONObject experiment, boolean populateIfNotPresent) { String cacheKey = geneId + ':' + datasetId; - if (cache.isCacheValid(cacheKey, message)) { + if (_cache.isCacheValid(cacheKey, message)) { try { - JSONObject cachedResponse = cache.readCachedData(cacheKey); - return CompletableFuture.completedFuture(Optional.of(cachedResponse)); - } catch (Exception e) { - System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); - - if (cacheMode == CacheMode.TEST) { - return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss - } - // Else, log and fall through to AI generation + JSONObject cachedResponse = cache.readCachedData(cacheKey); + return CompletableFuture.completedFuture(Optional.of(cachedResponse)); + } + catch (Exception e) { + System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); + + if (!populateIfNotPresent) { + return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss + } + // Else, log and fall through to AI generation } - } else if (cacheMode == CacheMode.TEST) { + } + else if (!populateIfNotPresent) { return CompletableFuture.completedFuture(Optional.empty()); } - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("experiment-summary") - .schema(experimentResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( + OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( + ResponseFormatJsonSchema.builder().jsonSchema( + JsonSchema.builder().name("experiment-summary").schema( + experimentResponseSchema).build()).build()).addSystemMessage( + 
systemMessage).addUserMessage(message).build(); // add dataset_id back to the response - return openAIClient.chat().completions().create(request) - .thenApply(completion -> { - // response is a JSON string - String jsonString = completion.choices().get(0).message().content().get(); - try { - JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - - // Cache the response - try { - cache.populateCache(cacheKey, message, jsonObject); - } catch (Exception e) { - System.err.println("Warning: Failed to cache response for gene " + geneId + - " and dataset " + datasetId + ": " + e.getMessage()); - } - - return Optional.of(jsonObject); - } catch (JSONException e) { - System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", datasetId); - return Optional.of(errorResponse); - } - }); + return _openAIClient.chat().completions().create(request).thenApply(completion -> { + // response is a JSON string + String jsonString = completion.choices().get(0).message().content().get(); + try { + JSONObject jsonObject = new JSONObject(jsonString); + jsonObject.put("dataset_id", datasetId); + + // Cache the response + try { + cache.populateCache(cacheKey, message, jsonObject); + } + catch (Exception e) { + System.err.println("Warning: Failed to cache response for gene " + geneId + " and dataset " + + datasetId + ": " + e.getMessage()); + } + + return Optional.of(jsonObject); + } + catch (JSONException e) { + System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + + ": " + e.getMessage()); + System.err.println("Raw response: " + jsonString); + JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", + datasetId); + return Optional.of(errorResponse); + } + }); } + private Optional sendExperimentSummariesToOpenAI(String geneId, + List experiments, boolean populateIfNotPresent) { + + String message = String.format( + "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + "```json\n%s\n```\n\n", + new JSONArray(experiments).toString()) + + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - private static Optional sendExperimentSummariesToOpenAI(String geneId, List experiments, CacheMode cacheMode) { - - String message = String.format("Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. 
Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() - .model(model) - .maxCompletionTokens(MAX_RESPONSE_TOKENS) - .responseFormat(ResponseFormatJsonSchema.builder() - .jsonSchema(JsonSchema.builder() - .name("expression-summary") - .schema(finalResponseSchema) - .build()) - .build()) - .addSystemMessage(systemMessage) - .addUserMessage(message) - .build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( + OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( + ResponseFormatJsonSchema.builder().jsonSchema( + JsonSchema.builder().name("expression-summary").schema( + finalResponseSchema).build()).build()).addSystemMessage(systemMessage).addUserMessage( + message).build(); // System.out.println(message); - ChatCompletion completion = openAIClient.chat().completions().create(request).join(); // join() waits for the async response + ChatCompletion completion = _openAIClient.chat().completions().create(request).join(); // join() waits for + // the async + // response String jsonString = completion.choices().get(0).message().content().get(); JSONObject rawResponseObject = new JSONObject(jsonString); - // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by + // AI JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - + return Optional.of(finalResponseObject); } - - public static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { + public static JSONObject consolidateSummary(JSONObject summaryResponse, + List individualResults) { // Gather all dataset IDs from individualResults and map them to summaries Map datasetSummaries = new HashMap<>(); for (JSONObject result : individualResults) { @@ -322,19 +324,21 @@ public static JSONObject consolidateSummary(JSONObject summaryResponse, List Date: Fri, 21 Feb 2025 10:04:03 -0500 Subject: [PATCH 11/31] Checkpoint commit; finished up AiExpressionCache and just need to trim down Summarizer --- .../ai/SingleGeneAiExpressionReporter.java | 20 +- .../ai/expression/AiExpressionCache.java | 287 ++++++++++++++---- .../ai/expression/GeneRecordProcessor.java | 26 +- .../report/ai/expression/Summarizer.java | 19 +- 4 files changed, 262 insertions(+), 90 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index ee0eb86eb..8686e5ad1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -11,10 +11,9 @@ import org.apidb.apicommon.model.TranscriptUtil; import org.apidb.apicommon.model.report.ai.expression.AiExpressionCache; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor; -import org.apidb.apicommon.model.report.ai.expression.Summarizer; import 
org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.apidb.apicommon.model.report.ai.expression.Summarizer; import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.answer.stream.RecordStream; import org.gusdb.wdk.model.answer.stream.RecordStreamFactory; import org.gusdb.wdk.model.record.RecordClass; @@ -86,8 +85,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent - ? getSummary(summaryInputs, summarizer, cache) - : readSummary(summaryInputs, cache); + ? cache.populateSummary(summaryInputs, summarizer::describeExperiment, summarizer::summarizeExperiments) + : cache.readSummary(summaryInputs); // join entries with commas if (firstRecord) firstRecord = false; else writer.write(","); @@ -98,17 +97,4 @@ protected void write(OutputStream out) throws IOException, WdkModelException { } } } - - private JSONObject getSummary(GeneSummaryInputs summaryInputs, Summarizer summarizer, AiExpressionCache cache) { - try { - - } - } - - private JSONObject readSummary(GeneSummaryInputs summaryInputs, AiExpressionCache cache) { - try { - - } - } - } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 1b48ea698..a34f07103 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -4,28 +4,61 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.List; import java.util.Optional; -import java.util.Set; +import java.util.concurrent.CompletableFuture; +import java.util.function.Predicate; +import org.apache.log4j.Logger; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs; import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; -import org.gusdb.fgputil.cache.disk.OnDiskCache.Overwrite; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; +import org.gusdb.fgputil.functional.FunctionalInterfaces.PredicateWithException; +import org.gusdb.fgputil.functional.FunctionalInterfaces.SupplierWithException; import org.gusdb.wdk.model.WdkModel; import org.json.JSONException; import org.json.JSONObject; public class AiExpressionCache { - // constants to determine cache location + private static Logger LOG = Logger.getLogger(AiExpressionCache.class); + + // cache location private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; + // catch characteristics private static final long DEFAULT_TIMEOUT_MILLIS = 5000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; + // cache filenames + private static final String CACHED_DATA_FILE = "cached_data.txt"; + private static final String CACHE_DIGEST_FILE = "digest.txt"; + + // returned JSON props and values + private static final String CACHE_STATUS = "cacheStatus"; // hit or 
miss + private static final String CACHE_HIT = "hit"; + private static final String HIT_RESULT = "expressionSummary"; // if hit, will have result + private static final String CACHE_MISS = "miss"; + private static final String MISS_REASON = "reason"; // if miss, will have reason + + // status messages + private static class LookupException extends Exception { + public static final LookupException EXPIRED_ENTRY = new LookupException("Expired entry"); + public static final LookupException CORRUPTED_ENTRY = new LookupException("Corrupted entry"); + public static final LookupException MISSING_ENTRY = new LookupException("Missing entry"); + private LookupException(String msg) { super(msg); } + public JSONObject toJson() { + return new JSONObject() + .put(CACHE_STATUS, CACHE_MISS) + .put(MISS_REASON, getMessage()); + } + } + // singleton pattern private static AiExpressionCache _instance; @@ -40,9 +73,11 @@ else if (_instance._wdkModel != wdkModel) { return _instance; } + // private fields private final WdkModel _wdkModel; private final OnDiskCache _cache; + // constructor public AiExpressionCache(WdkModel wdkModel) throws IOException { _wdkModel = wdkModel; @@ -52,83 +87,221 @@ public AiExpressionCache(WdkModel wdkModel) throws IOException { .orElse(Paths.get(_wdkModel.getModelConfig().getWdkTempDir().toString(), DEFAULT_TMP_CACHE_SUBDIR)); _cache = new OnDiskCache(cacheParentDir, DEFAULT_TIMEOUT_MILLIS, DEFAULT_POLL_FREQUENCY_MILLIS); + } + /** + * Tries to read a gene summary from the cache without populating if absent. + * + * @param summaryInputs inputs for cache lookup + * @return response JSON (indicating cache hit or not with data or miss reason respectively) + */ + public JSONObject readSummary(GeneSummaryInputs summaryInputs) { + try { + return _cache.visitContent(summaryInputs.getGeneId(), + geneDir -> getValidSummary(geneDir, summaryInputs)); + } + catch (LookupException e) { + return e.toJson(); + } + catch (EntryNotCreatedException e) { + return LookupException.MISSING_ENTRY.toJson(); + } + catch (Exception e) { + // any other exception is a 500 + throw e instanceof RuntimeException ? (RuntimeException)e : new RuntimeException(e); + } } - public void blah() { - _cache.populateAndProcessContent(geneId, populator, visitor, overwritePredicate) + /** + * Confirms experiment descriptors are present and up to date with the inputs; if so, + * confirms summary is up to date with the inputs; if so, returns it. If anything is + * missing or out of date, returns cache-miss JSON. 
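For orientation while reading the constants and lookup helpers above, a rough sketch of how a caller might consume the hit/miss envelope that readSummary() returns. The inner summary object here is just a placeholder (its real shape is defined by the response schemas in Summarizer), and the class name is invented; populateSummary() is documented below to always return a hit because it regenerates stale or missing entries.

import org.json.JSONObject;

// Illustrative client-side handling of the envelope described above. A hit carries the
// summary under "expressionSummary"; a miss carries a human-readable "reason"
// ("Missing entry", "Expired entry" or "Corrupted entry").
public class EnvelopeExample {
  public static void main(String[] args) {
    JSONObject hit = new JSONObject()
        .put("cacheStatus", "hit")
        .put("expressionSummary", new JSONObject().put("placeholder", "summary fields go here"));
    JSONObject miss = new JSONObject()
        .put("cacheStatus", "miss")
        .put("reason", "Expired entry");

    for (JSONObject envelope : new JSONObject[] { hit, miss }) {
      if ("hit".equals(envelope.getString("cacheStatus"))) {
        System.out.println("summary: " + envelope.getJSONObject("expressionSummary"));
      }
      else {
        System.out.println("no cached summary: " + envelope.getString("reason"));
      }
    }
  }
}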
+ * + * @param geneDir directory for the summary entry + * @param summaryInputs inputs + * @return response JSON (indicating cache hit or not with data or miss reason respectively) + * @throws Exception lookup or other exception if unable to find or validate cached data + */ + private JSONObject getValidSummary(Path geneDir, GeneSummaryInputs summaryInputs) throws Exception { + + // check for existence of valid cache entries for each experiment + // if any are missing or expired, exception will be thrown causing a cache miss + for (ExperimentInputs datasetInput : summaryInputs.getExperimentsWithData()) { + _cache.visitContent(datasetInput.getCacheKey(), experimentDir -> { + return getValidStoredData(experimentDir, datasetInput.getDigest()); + }); + } + + // once all experiment values are confirmed, check for valid summary entry + JSONObject summary = getValidStoredData(geneDir, summaryInputs.getDigest()); + return new JSONObject() + .put(CACHE_STATUS, CACHE_HIT) + .put(HIT_RESULT, summary); } - // Check if cached data is valid - public boolean isCacheValid(GeneSummaryInputs summaryInputs) { + /** + * Checks an entry for a valid digest and readable data file; if valid and present, returns + * parsed JSON data + * + * @param entryDir directory of the entry (could be summary or experiment) + * @param computedDigest expected digest; mismatch indicates cache entry is expired + * @return JSON data for this entry + * @throws IOException if unable to read files from disk + * @throws LookupException if entry is expired or corrupted + */ + private static JSONObject getValidStoredData(Path entryDir, String computedDigest) throws IOException, LookupException { + + // 1. check digest against existing value + if (!digestsMatch(entryDir, computedDigest)) { + throw LookupException.EXPIRED_ENTRY; + } + + // 2. check for presence of cached data, then read + return readCachedData(entryDir) + .orElseThrow(() -> LookupException.CORRUPTED_ENTRY); + } + + /** + * Checks if contents of digest file in the passed entry dir match a passed + * computed digest; returns false if file is missing or digests don't match, else true\ + * + * @param entryDir entry directory + * @param computedDigest digest to which existing digest should be compared + * @return whether digests match + * @throws IOException if unable to read file + */ + private static boolean digestsMatch(Path entryDir, String computedDigest) throws IOException { + Path digestFile = entryDir.resolve(CACHE_DIGEST_FILE); + return Files.exists(digestFile) && + Files.readString(digestFile).equals(computedDigest); + } + + /** + * Read cached data file from entry, returns empty optional if data file + * does not exist or is unable to read or parsed into JSON. + * + * @param entryDir entry directory + * @return optional entry data + */ + private static Optional readCachedData(Path entryDir) { try { - FunctionWithException visitor = entryDir -> { - Path digestFile = entryDir.resolve("digest.txt"); + Path file = entryDir.resolve(CACHED_DATA_FILE); + return Files.exists(file) + ? Optional.of(new JSONObject(Files.readString(file))) + : Optional.empty(); + } + catch (IOException | JSONException e) { + LOG.error("Unable to read or parse cached data", e); + return Optional.empty(); + } + } - if (!Files.exists(digestFile)) { - System.out.println("No digest file found."); - return false; - } + /** + * Returns a cached gene expression summary, generating and storing a new value if none + * exists or if the existing value is out of date with the passed digests. 
+ * + * @param summaryInputs gene summary inputs + * @param experimentDescriber function to describe an experiment + * @param experimentSummarizer function to summarize experiments into an expression summary + * @return expression summary (will always be a cache hit) + */ + public JSONObject populateSummary(GeneSummaryInputs summaryInputs, + FunctionWithException> experimentDescriber, + FunctionWithException, JSONObject> experimentSummarizer) { + try { + return _cache.populateAndProcessContent(summaryInputs.getGeneId(), - // Read stored digest and compare - String cachedDigest = Files.readString(digestFile); + // populator + entryDir -> { + // first populate each dataset entry as needed and collect experiment descriptors + List experiments = populateExperiments(summaryInputs.getExperimentsWithData(), experimentDescriber); - if (cachedDigest.equals(summaryInputs.getExperimentsDigest())) { - System.out.println("Cache digest matches input."); - return true; - } - else { - System.out.println("Cache digest mismatch! Cache is out of date."); - return false; - } - }; + // summarize experiments and store + getPopulator(summaryInputs.getDigest(), () -> experimentSummarizer.apply(experiments)).accept(entryDir); + }, - return _cache.visitContent(summaryInputs.getGeneId(), visitor); + // visitor + entryDir -> getValidSummary(entryDir, summaryInputs), - } - catch (EntryNotCreatedException e) { - System.out.println("Cache entry does not exist yet."); - return false; + // repopulation predicate + exceptionToTrue(entryDir -> + // try to look up summary json; if cache miss, then repopulate + getValidSummary(entryDir, summaryInputs).getString(CACHE_STATUS).equals(CACHE_MISS))); } catch (Exception e) { - throw new RuntimeException("Error validating cache entry", e); + // any other exception is a 500 + throw e instanceof RuntimeException ? (RuntimeException)e : new RuntimeException(e); } } - // Populate cache with computed data (Method 1: Takes computedData directly) - public void populateCache(GeneSummaryInputs summaryInputs, JSONObject computedData) throws Exception { - ConsumerWithException populator = entryDir -> { - Files.writeString(entryDir.resolve("cached_data.txt"), computedData.toString()); - Files.writeString(entryDir.resolve("digest.txt"), summaryInputs.getExperimentsDigest()); - }; + /** + * Returns a set of cached experiment descriptions, generating and storing new values for any + * experiments not present or that are out of date (mismatched digests). In this way, any new + * experiments do not result in regeneration of descriptors for previously released experiments. 
+ * + * @param experimentData experiment inputs + * @param experimentDescriber function to describe an experiment + * @return list of cached experiment descriptions + * @throws Exception if unable to generate descriptions or store + */ + private List populateExperiments(List experimentData, + FunctionWithException> experimentDescriber) throws Exception { + List experiments = new ArrayList<>(); + // start with serial generation; move back to parallel later + for (ExperimentInputs input : experimentData) { + experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), - // Populate with overwrite policy (assumes caller ensures it's necessary) - _cache.populateAndProcessContent(summaryInputs.getGeneId(), populator, path -> null, Overwrite.YES); - } + // populator + getPopulator(input.getDigest(), () -> experimentDescriber.apply(input.getExperimentData()).get()), -// // Populate cache with a function that computes the result (Method 2: Uses a function) -// public void populateCache(String cacheKey, String inputData, Function computation) throws Exception { -// populateCache(cacheKey, inputData, computation.apply(inputData)); -// } + // visitor + experimentDir -> getValidStoredData(experimentDir, input.getDigest()), + // repopulation predicate + exceptionToTrue(experimentDir -> { + getValidStoredData(experimentDir, input.getDigest()); + return false; // do not repopulate if able to look up valid value + }) + )); + } + return experiments; + } - // Read cached data (throws IOException if missing) - public JSONObject readCachedData(String cacheKey) throws Exception { - FunctionWithException visitor = entryDir -> { - Path file = entryDir.resolve("cached_data.txt"); - if (!Files.exists(file)) { - throw new IOException("Cache entry missing: " + file); - } - String fileContents = Files.readString(file); + /** + * Takes a predicate that throws an exception and returns a predicate that + * does not, converting any thrown exception to true + * + * @param predicate predicate that throws an exception + * @return the value returned by the passed predicate, or true if an exception is thrown + */ + private Predicate exceptionToTrue(PredicateWithException predicate) { + return path -> { try { - JSONObject jsonObject = new JSONObject(fileContents); - return jsonObject; - } catch (JSONException e) { - throw e; + return predicate.test(path); + } + catch (Exception e) { + return true; } }; - - return _cache.visitContent(cacheKey, visitor); + } + + /** + * Returns a function that populates a cache entry with the passed + * digest and with data supplied by the passed supplier. 
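The behaviour described in the sentence above boils down to a per-entry digest gate. The sketch below is a deliberately simplified stand-in, with invented names and none of the OnDiskCache locking or timeout handling: untouched experiment entries keep passing the digest check, while a newly added experiment and the gene-level summary (whose digest also changes) are the only things regenerated.

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.function.Supplier;
import org.json.JSONObject;

public class DigestGateSketch {

  // the expensive generator runs only when digest.txt no longer matches the digest
  // computed from the current inputs, or when the entry has never been written
  static JSONObject getOrRegenerate(Path entryDir, String currentDigest,
      Supplier<JSONObject> generator) throws IOException {
    Path digestFile = entryDir.resolve("digest.txt");
    Path dataFile = entryDir.resolve("cached_data.txt");

    // reuse the cached value if both files exist and the stored digest is still current
    if (Files.exists(digestFile) && Files.exists(dataFile)
        && Files.readString(digestFile).equals(currentDigest)) {
      return new JSONObject(Files.readString(dataFile));
    }

    // otherwise regenerate, then store the new data and digest for next time
    JSONObject fresh = generator.get();
    Files.writeString(dataFile, fresh.toString());
    Files.writeString(digestFile, currentDigest);
    return fresh;
  }
}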
+ * + * @param digest digest to store + * @param dataSupplier supplier of data to store + * @return population function + */ + private ConsumerWithException getPopulator(String digest, SupplierWithException dataSupplier) { + return entryDir -> { + + // write digest to digest file + Files.writeString(entryDir.resolve(CACHE_DIGEST_FILE), digest); + + // write data + Files.writeString(entryDir.resolve(CACHED_DATA_FILE), dataSupplier.get().toString()); + }; } } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 807cec0f3..9a2fc211c 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -1,9 +1,7 @@ package org.apidb.apicommon.model.report.ai.expression; import java.util.ArrayList; -import java.util.LinkedHashMap; import java.util.List; -import java.util.Map; import java.util.Set; import java.util.function.Function; import java.util.stream.Collectors; @@ -37,6 +35,8 @@ public interface ExperimentInputs { String getCacheKey(); + String getDatasetId(); + String getDigest(); JSONObject getExperimentData(); @@ -46,11 +46,11 @@ public interface GeneSummaryInputs { String getGeneId(); // is the cache key - Map getExperimentsWithData(); + List getExperimentsWithData(); - default String getExperimentsDigest() { + default String getDigest() { // TODO Does it make more sense to md5 the concatenation of the experiment hashes? - return EncryptionUtil.md5(getExperimentsWithData().values().stream() + return EncryptionUtil.md5(getExperimentsWithData().stream() .map(ExperimentInputs::getExperimentData) .map(JsonUtil::serialize) .collect(Collectors.joining())); @@ -65,7 +65,7 @@ public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record String geneId = getGeneId(record); - Map experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); return new GeneSummaryInputs() { @Override @@ -74,16 +74,16 @@ public String getGeneId() { } @Override - public Map getExperimentsWithData() { + public List getExperimentsWithData() { return experimentsWithData; } }; } - private static Map processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { + private static List processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: - Map experiments = new LinkedHashMap<>(); + List experiments = new ArrayList<>(); String geneId = getGeneId(record); TableValue expressionGraphs = record.getTableValue(EXPRESSION_GRAPH_TABLE); @@ -104,7 +104,13 @@ private static Map processExpressionData(RecordInstanc experimentInfo.put("data", filteredData); - experiments.put(datasetId, new ExperimentInputs() { + experiments.add(new ExperimentInputs() { + + @Override + public String getDatasetId() { + return datasetId; + } + @Override public String getCacheKey() { return geneId + ':' + datasetId; diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 865d78457..c348a6d22 100644 --- 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -151,8 +151,12 @@ public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean p // All experiment-level caches are valid, now check final summary cache Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); - return finalSummary.map(summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", - summary)).orElseGet(() -> new JSONObject().put("cacheStatus", "miss")); + return finalSummary + .map(summary -> new JSONObject() + .put("cacheStatus", "hit") + .put("expressionSummary", summary)) + .orElseGet(() -> new JSONObject() + .put("cacheStatus", "miss")); } // Send AI requests in parallel @@ -174,9 +178,12 @@ public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean p // System.exit(0); Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); - return finalSummary.map( - summary -> new JSONObject().put("cacheStatus", "hit").put("expressionSummary", summary)).orElseGet( - () -> new JSONObject().put("cacheStatus", "miss")); + return finalSummary + .map(summary -> new JSONObject() + .put("cacheStatus", "hit") + .put("expressionSummary", summary)) + .orElseGet(() -> new JSONObject() + .put("cacheStatus", "miss")); } catch (WdkModelException e) { @@ -275,7 +282,7 @@ else if (!populateIfNotPresent) { } private Optional sendExperimentSummariesToOpenAI(String geneId, - List experiments, boolean populateIfNotPresent) { + List experiments) { String message = String.format( "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + From be6a61a21f376510315a470d446766d8edacb7bf Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Fri, 21 Feb 2025 15:44:36 -0500 Subject: [PATCH 12/31] Clean cache logic out of summarizer --- .../ai/SingleGeneAiExpressionReporter.java | 2 +- .../ai/expression/AiExpressionCache.java | 6 +- .../report/ai/expression/Summarizer.java | 203 +++++------------- 3 files changed, 55 insertions(+), 156 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 8686e5ad1..e3e5808ce 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -91,7 +91,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // join entries with commas if (firstRecord) firstRecord = false; else writer.write(","); - // write JSON object + // write JSON object property, keyed by gene ID writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index a34f07103..da73895d3 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -206,7 +206,7 @@ private static Optional readCachedData(Path entryDir) { * @return expression summary (will 
always be a cache hit) */ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, - FunctionWithException> experimentDescriber, + FunctionWithException> experimentDescriber, FunctionWithException, JSONObject> experimentSummarizer) { try { return _cache.populateAndProcessContent(summaryInputs.getGeneId(), @@ -245,14 +245,14 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, * @throws Exception if unable to generate descriptions or store */ private List populateExperiments(List experimentData, - FunctionWithException> experimentDescriber) throws Exception { + FunctionWithException> experimentDescriber) throws Exception { List experiments = new ArrayList<>(); // start with serial generation; move back to parallel later for (ExperimentInputs input : experimentData) { experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), // populator - getPopulator(input.getDigest(), () -> experimentDescriber.apply(input.getExperimentData()).get()), + getPopulator(input.getDigest(), () -> experimentDescriber.apply(input).get()), // visitor experimentDir -> getValidStoredData(experimentDir, input.getDigest()), diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index c348a6d22..cf20e7f20 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -1,23 +1,15 @@ package org.apidb.apicommon.model.report.ai.expression; -import java.io.IOException; -import java.util.ArrayList; import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Optional; import java.util.Set; import java.util.concurrent.CompletableFuture; -import java.util.stream.Collectors; -import org.apache.log4j.Logger; -import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; +import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs; import org.gusdb.fgputil.json.JsonUtil; import org.gusdb.wdk.model.WdkModel; -import org.gusdb.wdk.model.WdkModelException; -import org.gusdb.wdk.model.WdkUserException; -import org.gusdb.wdk.model.record.RecordInstance; import org.json.JSONArray; import org.json.JSONException; import org.json.JSONObject; @@ -33,14 +25,13 @@ public class Summarizer { - private static final Logger LOG = Logger.getLogger(Summarizer.class); - // provide exact model number for semi-reproducibility - public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + // TODO: should this be incorporated into the digests, so if we change the chat model, all generated summaries become expired? + private static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; private static final int MAX_RESPONSE_TOKENS = 10000; - private static final String systemMessage = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; + private static final String SYSTEM_MESSAGE = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; // Prepare JSON schemas for structured responses // NOTE: this code is horrible to look at/read. 
It would be better to just define the schemas as JSON strings @@ -122,77 +113,6 @@ public Summarizer(WdkModel wdkModel) { .build(); } - public JSONObject summarizeExpression(GeneSummaryInputs summaryInputs, boolean populateIfNotPresent) - throws WdkUserException { - - try { - - // Process expression data further into a list of pruned metadata plus data - List experimentsWithData = summaryInputs.getExperimentsWithData(); - String geneId = summaryInputs.getGeneId(); - System.out.println("Pre-processed Experiments: " + experimentsWithData.size()); - - // TEST Mode: Collect valid cache entries - if (!populateIfNotPresent) { - List cachedResponses = new ArrayList<>(); - - for (JSONObject experiment : experimentsWithData) { - - Optional experimentSummary = sendExperimentToOpenAI(geneId, experiment, populateIfNotPresent).join(); - - if (experimentSummary.isPresent()) { - cachedResponses.add(experimentSummary.get()); - } - else { - return new JSONObject().put("cacheStatus", "miss"); // If any cache entry is missing, return early - } - } - - // All experiment-level caches are valid, now check final summary cache - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, cachedResponses, populateIfNotPresent); - - return finalSummary - .map(summary -> new JSONObject() - .put("cacheStatus", "hit") - .put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject() - .put("cacheStatus", "miss")); - } - - // Send AI requests in parallel - List>> aiRequests = experimentsWithData.stream() - // TO DO - potentially some optimisation? - // .map(exp -> CompletableFuture.supplyAsync(() -> sendExperimentToOpenAI(geneId, exp, - // CacheMode.POPULATE))) - .map(exp -> sendExperimentToOpenAI(geneId, exp, populateIfNotPresent)).collect(Collectors.toList()); - // Wait for all requests to complete with `join` - List responses = aiRequests.stream().map(CompletableFuture::join) // Get - // Optional - .filter(Optional::isPresent) // Keep only non-empty results - .map(Optional::get) // Extract JSONObject - .collect(Collectors.toList()); - - // Debug output - // System.out.println("Individual responses:"); - // responses.forEach(response -> System.out.println(response.toString(2))); - // System.exit(0); - - Optional finalSummary = sendExperimentSummariesToOpenAI(geneId, responses, populateIfNotPresent); - return finalSummary - .map(summary -> new JSONObject() - .put("cacheStatus", "hit") - .put("expressionSummary", summary)) - .orElseGet(() -> new JSONObject() - .put("cacheStatus", "miss")); - } - catch (WdkModelException e) { - - // Handle errors gracefully - System.err.println("Error fetching expression data: " + e.getMessage()); - throw new WdkUserException(e); - } - } - public static String getExperimentMessage(JSONObject experiment) { // Possible TO DO: AI EDIT DESCRIPTION @@ -221,35 +141,20 @@ public static String getExperimentMessage(JSONObject experiment) { "**Further guidance**: The `y_axis` field describes the `value` field in the `data` array, which is the primary expression level datum. Note that standard error statistics are only available when biological replicates were performed. However, percentile-normalized values can also guide your assessment of importance. If this is a time-series experiment, consider if it is cyclical and assess periodicity as appropriate. Ignore all discussion of individual or groups of genes in the experiment `description`, as this is irrelevant to the gene you are summarising. 
For RNA-Seq experiments, be aware that if `paralog_number` is high, interpretation may be tricky (consider both unique and non-unique counts if available). Ensure that each key appears exactly once in the JSON response. Do not include any duplicate fields."; } - private CompletableFuture> sendExperimentToOpenAI(String geneId, - JSONObject experiment, boolean populateIfNotPresent) { - - String cacheKey = geneId + ':' + datasetId; - - if (_cache.isCacheValid(cacheKey, message)) { - try { - JSONObject cachedResponse = cache.readCachedData(cacheKey); - return CompletableFuture.completedFuture(Optional.of(cachedResponse)); - } - catch (Exception e) { - System.err.println("Cache read failed for key " + cacheKey + ": " + e.getMessage()); - - if (!populateIfNotPresent) { - return CompletableFuture.completedFuture(Optional.empty()); // Treat as cache miss - } - // Else, log and fall through to AI generation - } - } - else if (!populateIfNotPresent) { - return CompletableFuture.completedFuture(Optional.empty()); - } - - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( - OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( - ResponseFormatJsonSchema.builder().jsonSchema( - JsonSchema.builder().name("experiment-summary").schema( - experimentResponseSchema).build()).build()).addSystemMessage( - systemMessage).addUserMessage(message).build(); + public CompletableFuture describeExperiment(ExperimentInputs experimentInputs) { + + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(OPENAI_CHAT_MODEL) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("experiment-summary") + .schema(experimentResponseSchema) + .build()) + .build()) + .addSystemMessage(SYSTEM_MESSAGE) + .addUserMessage(getExperimentMessage(experimentInputs.getExperimentData())) + .build(); // add dataset_id back to the response return _openAIClient.chat().completions().create(request).thenApply(completion -> { @@ -257,62 +162,56 @@ else if (!populateIfNotPresent) { String jsonString = completion.choices().get(0).message().content().get(); try { JSONObject jsonObject = new JSONObject(jsonString); - jsonObject.put("dataset_id", datasetId); - - // Cache the response - try { - cache.populateCache(cacheKey, message, jsonObject); - } - catch (Exception e) { - System.err.println("Warning: Failed to cache response for gene " + geneId + " and dataset " + - datasetId + ": " + e.getMessage()); - } - - return Optional.of(jsonObject); + jsonObject.put("dataset_id", experimentInputs.getDatasetId()); + return jsonObject; } catch (JSONException e) { - System.err.println("Error parsing JSON response for gene " + geneId + " and dataset " + datasetId + - ": " + e.getMessage()); - System.err.println("Raw response: " + jsonString); - JSONObject errorResponse = new JSONObject().put("error", "Invalid JSON response").put("dataset_id", - datasetId); - return Optional.of(errorResponse); + throw new RuntimeException( + "Error parsing JSON response for dataset " + experimentInputs.getDatasetId() + + ". 
Raw response string:\n" + jsonString + "\n", e); } }); } - private Optional sendExperimentSummariesToOpenAI(String geneId, - List experiments) { + public JSONObject summarizeExperiments(List experiments) { - String message = String.format( + String message = "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + - "```json\n%s\n```\n\n", - new JSONArray(experiments).toString()) + + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - ChatCompletionCreateParams request = ChatCompletionCreateParams.builder().model( - OPENAI_CHAT_MODEL).maxCompletionTokens(MAX_RESPONSE_TOKENS).responseFormat( - ResponseFormatJsonSchema.builder().jsonSchema( - JsonSchema.builder().name("expression-summary").schema( - finalResponseSchema).build()).build()).addSystemMessage(systemMessage).addUserMessage( - message).build(); + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() + .model(OPENAI_CHAT_MODEL) + .maxCompletionTokens(MAX_RESPONSE_TOKENS) + .responseFormat(ResponseFormatJsonSchema.builder() + .jsonSchema(JsonSchema.builder() + .name("expression-summary") + .schema(finalResponseSchema) + .build()) + .build()) + .addSystemMessage(SYSTEM_MESSAGE) + .addUserMessage(message) + .build(); - // System.out.println(message); + ChatCompletion completion = _openAIClient.chat().completions().create(request) + .join(); // join() waits for the async response - ChatCompletion completion = _openAIClient.chat().completions().create(request).join(); // join() waits for - // the async - // response String jsonString = completion.choices().get(0).message().content().get(); - JSONObject rawResponseObject = new JSONObject(jsonString); + try { + JSONObject rawResponseObject = new JSONObject(jsonString); - // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by - // AI - JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); + // quality control (remove bad `dataset_id`s) and add 'Others' section for any experiments not listed by AI + JSONObject finalResponseObject = consolidateSummary(rawResponseObject, experiments); - return Optional.of(finalResponseObject); + return finalResponseObject; + } + catch (JSONException e) { + throw new RuntimeException("Error parsing JSON response " + + "for gene summary. 
Raw response string:\n" + jsonString + "\n", e); + } } - public static JSONObject consolidateSummary(JSONObject summaryResponse, + private static JSONObject consolidateSummary(JSONObject summaryResponse, List individualResults) { // Gather all dataset IDs from individualResults and map them to summaries Map datasetSummaries = new HashMap<>(); From 32a73518ba77ebf6f3f93f9e1bfedaabb9c0d733 Mon Sep 17 00:00:00 2001 From: Bob Date: Sat, 22 Feb 2025 18:23:49 +0000 Subject: [PATCH 13/31] make cache validation digest symmetrical for both levels of AI query --- .../ai/SingleGeneAiExpressionReporter.java | 3 ++- .../ai/expression/GeneRecordProcessor.java | 27 ++++++++++++------- .../report/ai/expression/Summarizer.java | 11 +++++--- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index e3e5808ce..1b80ca346 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -81,7 +81,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { for (RecordInstance record : recordStream) { // create summary inputs - GeneSummaryInputs summaryInputs = GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage); + GeneSummaryInputs summaryInputs = + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 9a2fc211c..ceee1482f 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -48,24 +48,18 @@ public interface GeneSummaryInputs { List getExperimentsWithData(); - default String getDigest() { - // TODO Does it make more sense to md5 the concatenation of the experiment hashes? 
- return EncryptionUtil.md5(getExperimentsWithData().stream() - .map(ExperimentInputs::getExperimentData) - .map(JsonUtil::serialize) - .collect(Collectors.joining())); - } + String getDigest(); } private static String getGeneId(RecordInstance record) { return record.getPrimaryKey().getValues().get("gene_source_id"); } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function experimentDigester) throws WdkModelException { + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { String geneId = getGeneId(record); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record, experimentDigester, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, getExperimentPrompt, 0); return new GeneSummaryInputs() { @Override @@ -77,6 +71,21 @@ public String getGeneId() { public List getExperimentsWithData() { return experimentsWithData; } + + @Override + public String getDigest() { + // Instead of building the final summary prompt using the AI-generated **summary outputs** + // (which happens during real processing), we construct it using JSON-encoded MD5 + // **digests** of the per-experiment **inputs**. + // + // This avoids fetching per-experiment results from the cache while remaining + // functionally identical for cache validation purposes. + List digests = experimentsWithData.stream() + .map(exp -> new JSONObject().put("digest", exp.getDigest())) + .collect(Collectors.toList()); + return EncryptionUtil.md5(getFinalSummaryPrompt.apply(digests)); + } + }; } diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index cf20e7f20..8a8ae7aa1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -173,13 +173,16 @@ public CompletableFuture describeExperiment(ExperimentInputs experim }); } - public JSONObject summarizeExperiments(List experiments) { + public static String getFinalSummaryMessage(List experiments) { - String message = - "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + return "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. 
Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; + } + + public JSONObject summarizeExperiments(List experiments) { + ChatCompletionCreateParams request = ChatCompletionCreateParams.builder() .model(OPENAI_CHAT_MODEL) .maxCompletionTokens(MAX_RESPONSE_TOKENS) @@ -190,7 +193,7 @@ public JSONObject summarizeExperiments(List experiments) { .build()) .build()) .addSystemMessage(SYSTEM_MESSAGE) - .addUserMessage(message) + .addUserMessage(getFinalSummaryMessage(experiments)) .build(); ChatCompletion completion = _openAIClient.chat().completions().create(request) From 44b65aab7f49474aa691125bb71c5e3287a5e214 Mon Sep 17 00:00:00 2001 From: Bob Date: Sat, 22 Feb 2025 18:49:45 +0000 Subject: [PATCH 14/31] improved code formatting of JSONSchema definitions --- .../report/ai/expression/Summarizer.java | 96 +++++++------------ 1 file changed, 34 insertions(+), 62 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 8a8ae7aa1..3207a0bd0 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -34,72 +34,44 @@ public class Summarizer { private static final String SYSTEM_MESSAGE = "You are a bioinformatician working for VEuPathDB.org. You are an expert at providing biologist-friendly summaries of transcriptomic data"; // Prepare JSON schemas for structured responses - // NOTE: this code is horrible to look at/read. It would be better to just define the schemas as JSON strings - // but this is only really nice when we have """ text block """ support, coming soon when we upgrade, perhaps? 
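// [Editor's aside, not part of this patch series] The removed note above wishes these schemas
// were defined as plain JSON strings rather than nested builder calls. A rough sketch of that
// approach, assuming Java text blocks are available and that JsonValue.from() accepts the
// nested Maps/Lists produced by org.json.JSONObject.toMap(); the helper name schemaFromJson
// and the field name experimentResponseSchemaFromText are made up for illustration:

private static JsonSchema.Schema schemaFromJson(String json) {
  var builder = JsonSchema.Schema.builder();
  // convert the parsed JSON into the property map the openai-java builder expects
  new JSONObject(json).toMap()
      .forEach((key, value) -> builder.putAdditionalProperty(key, JsonValue.from(value)));
  return builder.build();
}

private static final JsonSchema.Schema experimentResponseSchemaFromText = schemaFromJson("""
    {
      "type": "object",
      "properties": {
        "one_sentence_summary":  { "type": "string" },
        "biological_importance": { "type": "integer", "minimum": 0, "maximum": 5 },
        "confidence":            { "type": "integer", "minimum": 0, "maximum": 5 },
        "experiment_keywords":   { "type": "array", "items": { "type": "string" } },
        "notes":                 { "type": "string" }
      },
      "required": ["one_sentence_summary", "biological_importance", "confidence",
                   "experiment_keywords", "notes"]
    }
    """);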
- private static final JsonSchema.Schema experimentResponseSchema = - JsonSchema.Schema.builder() + private static final JsonSchema.Schema experimentResponseSchema = JsonSchema.Schema.builder() .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "one_sentence_summary", Map.of("type", "string"), - "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), - "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), - "notes", Map.of("type", "string") - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "one_sentence_summary", - "biological_importance", - "confidence", - "experiment_keywords", - "notes") - ) - ) + .putAdditionalProperty("properties", JsonValue.from(Map.of( + "one_sentence_summary", Map.of("type", "string"), + "biological_importance", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "confidence", Map.of("type", "integer", "minimum", 0, "maximum", 5), + "experiment_keywords", Map.of("type", "array", "items", Map.of("type", "string")), + "notes", Map.of("type", "string") + ))) + .putAdditionalProperty("required", JsonValue.from(List.of( + "one_sentence_summary", + "biological_importance", + "confidence", + "experiment_keywords", + "notes" + ))) .build(); - private static final JsonSchema.Schema finalResponseSchema = - JsonSchema.Schema.builder() + private static final JsonSchema.Schema finalResponseSchema = JsonSchema.Schema.builder() .putAdditionalProperty("type", JsonValue.from("object")) - .putAdditionalProperty("properties", - JsonValue - .from(Map - .of( - "headline", Map.of("type", "string"), - "one_paragraph_summary", Map.of("type", "string"), - "sections", - Map.of("type", "array", - "minimum", 1, - "items", - Map.of( - "type", "object", - "required", List.of("headline", "one_sentence_summary", "dataset_ids"), - "properties", - Map.of( - "headline", Map.of("type", "string"), - "one_sentence_summary", Map.of("type", "string"), - "dataset_ids", Map.of("type", "array", - "items", Map.of("type", "string")) - ) - ) - ) - ) - ) - ) - .putAdditionalProperty("required", - JsonValue.from( - List.of( - "headline", - "one_paragraph_summary", - "dataset_ids" - ) - ) - ) + .putAdditionalProperty("properties", JsonValue.from(Map.of( + "headline", Map.of("type", "string"), + "one_paragraph_summary", Map.of("type", "string"), + "sections", Map.of("type", "array", "minimum", 1, "items", Map.of( + "type", "object", + "required", List.of("headline", "one_sentence_summary", "dataset_ids"), + "properties", Map.of( + "headline", Map.of("type", "string"), + "one_sentence_summary", Map.of("type", "string"), + "dataset_ids", Map.of("type", "array", "items", Map.of("type", "string")) + ) + )) + ))) + .putAdditionalProperty("required", JsonValue.from(List.of( + "headline", + "one_paragraph_summary", + "dataset_ids" + ))) .build(); private static final String OPENAI_API_KEY_PROP_NAME = "OPENAI_API_KEY"; From 4bcb86f670c4103af5ba8d41700e70ba5ae2085d Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 23 Feb 2025 21:24:18 -0500 Subject: [PATCH 15/31] Incorporate AI chat model string into result digests; they will become out-of-date if the chat model changes --- .../ai/SingleGeneAiExpressionReporter.java | 3 +- .../ai/expression/GeneRecordProcessor.java | 29 +++++++++---------- .../report/ai/expression/Summarizer.java | 3 +- 3 files changed, 17 insertions(+), 18 
deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 1b80ca346..07919351d 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -82,7 +82,8 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // create summary inputs GeneSummaryInputs summaryInputs = - GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.asString(), + Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested JSONObject expressionSummary = _populateIfNotPresent diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index ceee1482f..c82dede69 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -7,7 +7,6 @@ import java.util.stream.Collectors; import org.gusdb.fgputil.EncryptionUtil; -import org.gusdb.fgputil.json.JsonUtil; import org.gusdb.wdk.model.WdkModelException; import org.gusdb.wdk.model.WdkUserException; import org.gusdb.wdk.model.record.RecordInstance; @@ -55,11 +54,11 @@ private static String getGeneId(RecordInstance record) { return record.getPrimaryKey().getValues().get("gene_source_id"); } - public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { + public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, String aiChatModel, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { String geneId = getGeneId(record); - List experimentsWithData = GeneRecordProcessor.processExpressionData(record, getExperimentPrompt, 0); + List experimentsWithData = GeneRecordProcessor.processExpressionData(record, aiChatModel, getExperimentPrompt, 0); return new GeneSummaryInputs() { @Override @@ -74,22 +73,22 @@ public List getExperimentsWithData() { @Override public String getDigest() { - // Instead of building the final summary prompt using the AI-generated **summary outputs** - // (which happens during real processing), we construct it using JSON-encoded MD5 - // **digests** of the per-experiment **inputs**. - // - // This avoids fetching per-experiment results from the cache while remaining - // functionally identical for cache validation purposes. - List digests = experimentsWithData.stream() - .map(exp -> new JSONObject().put("digest", exp.getDigest())) - .collect(Collectors.toList()); - return EncryptionUtil.md5(getFinalSummaryPrompt.apply(digests)); + // Instead of building the final summary prompt using the AI-generated **summary outputs** + // (which happens during real processing), we construct it using JSON-encoded MD5 + // **digests** of the per-experiment **inputs**. 
+ // + // This avoids fetching per-experiment results from the cache while remaining + // functionally identical for cache validation purposes. + List digests = experimentsWithData.stream() + .map(exp -> new JSONObject().put("digest", exp.getDigest())) + .collect(Collectors.toList()); + return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests)); } }; } - private static List processExpressionData(RecordInstance record, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { + private static List processExpressionData(RecordInstance record, String aiChatModel, Function getExperimentPrompt, int maxExperiments) throws WdkModelException { try { // return value: List experiments = new ArrayList<>(); @@ -127,7 +126,7 @@ public String getCacheKey() { @Override public String getDigest() { - return EncryptionUtil.md5(getExperimentPrompt.apply(getExperimentData())); + return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData())); } @Override diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index 3207a0bd0..d4ff58fe4 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -26,8 +26,7 @@ public class Summarizer { // provide exact model number for semi-reproducibility - // TODO: should this be incorporated into the digests, so if we change the chat model, all generated summaries become expired? - private static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; + public static final ChatModel OPENAI_CHAT_MODEL = ChatModel.GPT_4O_2024_11_20; // GPT_4O_2024_08_06; private static final int MAX_RESPONSE_TOKENS = 10000; From be790187b6945ccac69cdaad5b7df12b083ee64a Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Sun, 23 Feb 2025 22:51:19 -0500 Subject: [PATCH 16/31] Reparallelize experiment lookups --- .../ai/expression/AiExpressionCache.java | 51 +++++++++++++++---- 1 file changed, 42 insertions(+), 9 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index da73895d3..8c65b7f33 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -1,5 +1,8 @@ package org.apidb.apicommon.model.report.ai.expression; +import static java.util.concurrent.CompletableFuture.supplyAsync; +import static org.gusdb.fgputil.functional.Functions.wrapException; + import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; @@ -8,6 +11,8 @@ import java.util.List; import java.util.Optional; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.function.Predicate; import org.apache.log4j.Logger; @@ -15,6 +20,7 @@ import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.GeneSummaryInputs; import org.gusdb.fgputil.cache.disk.OnDiskCache; import org.gusdb.fgputil.cache.disk.OnDiskCache.EntryNotCreatedException; +import org.gusdb.fgputil.functional.Either; import org.gusdb.fgputil.functional.FunctionalInterfaces.ConsumerWithException; 
import org.gusdb.fgputil.functional.FunctionalInterfaces.FunctionWithException; import org.gusdb.fgputil.functional.FunctionalInterfaces.PredicateWithException; @@ -27,6 +33,9 @@ public class AiExpressionCache { private static Logger LOG = Logger.getLogger(AiExpressionCache.class); + // parallel processing + private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 5; + // cache location private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR"; private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; @@ -246,10 +255,17 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs, */ private List populateExperiments(List experimentData, FunctionWithException> experimentDescriber) throws Exception { - List experiments = new ArrayList<>(); - // start with serial generation; move back to parallel later - for (ExperimentInputs input : experimentData) { - experiments.add(_cache.populateAndProcessContent(input.getCacheKey(), + + // use a thread for each experiment, up to a reasonable max + int threadPoolSize = Math.min(MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST, experimentData.size()); + + ExecutorService exec = Executors.newFixedThreadPool(threadPoolSize); + try { + // look up experiment results in parallel, wait for completion, and aggregate results + List> results = new ArrayList<>(); + for (ExperimentInputs input : experimentData) { + + results.add(supplyAsync(() -> wrapException(() -> _cache.populateAndProcessContent(input.getCacheKey(), // populator getPopulator(input.getDigest(), () -> experimentDescriber.apply(input).get()), @@ -259,12 +275,29 @@ private List populateExperiments(List experimentDa // repopulation predicate exceptionToTrue(experimentDir -> { - getValidStoredData(experimentDir, input.getDigest()); - return false; // do not repopulate if able to look up valid value - }) - )); + getValidStoredData(experimentDir, input.getDigest()); + return false; // do not repopulate if able to look up valid value + })) + + ), exec)); + } + + // wait for all threads, filling lists along the way + List descriptors = new ArrayList<>(); + List exceptions = new ArrayList<>(); + for (CompletableFuture result : results) { + result.handle(Either::new).get().ifLeft(descriptors::add).ifRight(exceptions::add); + } + + // if no exceptions occurred, return results; else throw first problem + if (exceptions.isEmpty()) { + return descriptors; + } + throw new RuntimeException(exceptions.get(0)); + } + finally { + exec.shutdown(); } - return experiments; } /** From 18f08707137ab97d6299c1af0eab17f8a070b2aa Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 10:52:28 -0500 Subject: [PATCH 17/31] increase timeout and use ChatModel.toString() to fix exception --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 2 +- .../apicommon/model/report/ai/expression/AiExpressionCache.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 07919351d..76a1d83e4 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -82,7 +82,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { // create summary inputs GeneSummaryInputs summaryInputs = - 
GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.asString(), + GeneRecordProcessor.getSummaryInputsFromRecord(record, Summarizer.OPENAI_CHAT_MODEL.toString(), Summarizer::getExperimentMessage, Summarizer::getFinalSummaryMessage); // fetch summary, producing if necessary and requested diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java index 8c65b7f33..eae3a6dfc 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java @@ -41,7 +41,7 @@ public class AiExpressionCache { private static final String DEFAULT_TMP_CACHE_SUBDIR = "expressionCache"; // catch characteristics - private static final long DEFAULT_TIMEOUT_MILLIS = 5000; + private static final long DEFAULT_TIMEOUT_MILLIS = 5 * 60 * 1000; private static final long DEFAULT_POLL_FREQUENCY_MILLIS = 500; // cache filenames From a5799699455d03e62b007e07c328f16b8cd96730 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 11:47:31 -0500 Subject: [PATCH 18/31] getGeneId() fix --- .../model/report/ai/expression/GeneRecordProcessor.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index c82dede69..6c5349953 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -51,7 +51,7 @@ public interface GeneSummaryInputs { } private static String getGeneId(RecordInstance record) { - return record.getPrimaryKey().getValues().get("gene_source_id"); + return record.getPrimaryKey().getValues().get("source_id"); } public static GeneSummaryInputs getSummaryInputsFromRecord(RecordInstance record, String aiChatModel, Function getExperimentPrompt, Function, String> getFinalSummaryPrompt) throws WdkModelException { From 6693be51476c9883ebdb4451c16bacc5a670745c Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Mon, 24 Feb 2025 12:42:44 -0500 Subject: [PATCH 19/31] bugfix --- .../model/report/ai/SingleGeneAiExpressionReporter.java | 1 + 1 file changed, 1 insertion(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java index 76a1d83e4..e840a8850 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/SingleGeneAiExpressionReporter.java @@ -97,6 +97,7 @@ protected void write(OutputStream out) throws IOException, WdkModelException { writer.write("\"" + summaryInputs.getGeneId() + "\":" + expressionSummary.toString()); } + writer.write("}"); } } } From 4f52d3796e6fb0d4e4d3b09edf30d3a42a8b7898 Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 25 Feb 2025 06:24:37 -0500 Subject: [PATCH 20/31] reworked summary prompt to avoid generalities and for clarity --- .../apicommon/model/report/ai/expression/Summarizer.java | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d4ff58fe4..cd0cf1dd1 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -146,10 +146,13 @@ public CompletableFuture describeExperiment(ExperimentInputs experim public static String getFinalSummaryMessage(List experiments) { - return "Below are AI-generated summaries of a gene's behaviour in multiple transcriptomics experiments, provided in JSON format:\n\n" + + return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Provide a snappy headline and a one-paragraph summary of the gene's expression characteristics that gives the most biological insight into its function. Both are for human consumption on the gene page of our website. Also organise the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. Provide a headline and one-sentence summary for each section. These will also be shown to users. Wrap species names in `` tags and use clear, scientific language accessible to non-native English speakers throughout your response."; - + "Generate a one-paragraph summary (~100 words) describing the gene's expression. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\".\n\n" + + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + + "- A headline summarizing the section's key findings\n" + + "- A concise one-sentence summary of the experimental results\n\n" + + "These sections will be displayed to users. Wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; } public JSONObject summarizeExperiments(List experiments) { From 6a5a078158e57da6fb7349fde7e6ba42097f6f9d Mon Sep 17 00:00:00 2001 From: Bob MacCallum Date: Tue, 25 Feb 2025 14:59:03 -0500 Subject: [PATCH 21/31] prompt for structured summary paragraph --- .../apicommon/model/report/ai/expression/Summarizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index cd0cf1dd1..f0ccf1865 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -148,11 +148,11 @@ public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + - "Generate a one-paragraph summary (~100 words) describing the gene's expression. 
If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\".\n\n" + + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
  • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + "- A headline summarizing the section's key findings\n" + "- A concise one-sentence summary of the experimental results\n\n" + - "These sections will be displayed to users. Wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; + "These sections will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; } public JSONObject summarizeExperiments(List experiments) { From f35a43e29524ae04bfcb18a0cc030084519dc76b Mon Sep 17 00:00:00 2001 From: Ryan Doherty Date: Fri, 28 Feb 2025 03:25:45 -0500 Subject: [PATCH 22/31] Remove openai version (now in base pom) --- Model/pom.xml | 2 -- 1 file changed, 2 deletions(-) diff --git a/Model/pom.xml b/Model/pom.xml index 0d1bbe5dd..76e98617a 100644 --- a/Model/pom.xml +++ b/Model/pom.xml @@ -135,11 +135,9 @@ Jackfish - com.openai openai-java - 0.22.0 From 6dd538058b03ec7bf8f8b084baaeb5d1b9d32028 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:00:01 +0000 Subject: [PATCH 23/31] pretty print JSON sent to the model --- .../apicommon/model/report/ai/expression/Summarizer.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index f0ccf1865..ecef898ca 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -101,7 +101,7 @@ public static String getExperimentMessage(JSONObject experiment) { // We don't need to send dataset_id to the AI but it's useful to have it // in the response for phase two - JSONObject experimentForAI = new JSONObject(experiment.toString()); // clone + JSONObject experimentForAI = new JSONObject(experiment.toString(2)); // clone experimentForAI.remove("dataset_id"); return @@ -147,7 +147,7 @@ public CompletableFuture describeExperiment(ExperimentInputs experim public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + - String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString()) + + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
    • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + "- A headline summarizing the section's key findings\n" + From 6607b0d03c21174ef7b4a314388d953b52bfc5b6 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:07:07 +0000 Subject: [PATCH 24/31] sections renamed to topics --- .../report/ai/expression/Summarizer.java | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index ecef898ca..fcfac1d52 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -56,7 +56,7 @@ public class Summarizer { .putAdditionalProperty("properties", JsonValue.from(Map.of( "headline", Map.of("type", "string"), "one_paragraph_summary", Map.of("type", "string"), - "sections", Map.of("type", "array", "minimum", 1, "items", Map.of( + "topics", Map.of("type", "array", "minimum", 1, "items", Map.of( "type", "object", "required", List.of("headline", "one_sentence_summary", "dataset_ids"), "properties", Map.of( @@ -149,10 +149,10 @@ public static String getFinalSummaryMessage(List experiments) { return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" + String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) + "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using ,
<p>, <ul>, and <li>
      • tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" + - "Additionally, organize the experimental results (identified by `dataset_id`) into sections, ordered by descending biological importance. For each section, provide:\n" + - "- A headline summarizing the section's key findings\n" + - "- A concise one-sentence summary of the experimental results\n\n" + - "These sections will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers."; + "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" + + "- A headline summarizing the key experimental results within the topic\n" + + "- A concise one-sentence summary of the topic's experimental results\n\n" + + "These topics will be displayed to users. In all generated text, wrap species names in `` tags and use clear, precise scientific language accessible to non-native English speakers." } public JSONObject summarizeExperiments(List experiments) { @@ -197,12 +197,12 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, } Set seenDatasetIds = new HashSet<>(); - JSONArray deduplicatedSections = new JSONArray(); - JSONArray sections = summaryResponse.getJSONArray("sections"); + JSONArray deduplicatedTopics = new JSONArray(); + JSONArray topics = summaryResponse.getJSONArray("topics"); - for (int i = 0; i < sections.length(); i++) { - JSONObject section = sections.getJSONObject(i); - JSONArray datasetIds = section.getJSONArray("dataset_ids"); + for (int i = 0; i < topics.length(); i++) { + JSONObject topic = topics.getJSONObject(i); + JSONArray datasetIds = topic.getJSONArray("dataset_ids"); JSONArray summaries = new JSONArray(); for (int j = 0; j < datasetIds.length(); j++) { @@ -211,7 +211,7 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, // Warn and skip if the id doesn't exist if (!datasetSummaries.containsKey(id)) { System.out.println( - "WARNING: summary section id '" + id + "' does not exist. Excluding from final output."); + "WARNING: dataset_id '" + id + "' does not exist. 
Excluding from final output."); continue; } // Skip if we've seen it @@ -222,34 +222,34 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse, summaries.put(datasetSummaries.get(id)); } - // Update section with mapped summaries and remove dataset_ids key - section.put("summaries", summaries); - section.remove("dataset_ids"); - deduplicatedSections.put(section); + // Update topic with mapped summaries and remove dataset_ids key + topic.put("summaries", summaries); + topic.remove("dataset_ids"); + deduplicatedTopics.put(topic); } // Find missing dataset IDs Set missingDatasetIds = new HashSet<>(datasetSummaries.keySet()); missingDatasetIds.removeAll(seenDatasetIds); - // If there are missing IDs, add an "Others" section + // If there are missing IDs, add an "Others" topic if (!missingDatasetIds.isEmpty()) { JSONArray otherSummaries = new JSONArray(); for (String id : missingDatasetIds) { otherSummaries.put(datasetSummaries.get(id)); } - JSONObject otherSection = new JSONObject(); - otherSection.put("headline", "Other"); - otherSection.put("one_sentence_summary", - "These experiments were not grouped into sub-sections by the AI."); - otherSection.put("summaries", otherSummaries); - deduplicatedSections.put(otherSection); + JSONObject otherTopic = new JSONObject(); + otherTopic.put("headline", "Other"); + otherTopic.put("one_sentence_summary", + "These experiments were not grouped by the AI."); + otherTopic.put("summaries", otherSummaries); + deduplicatedTopics.put(otherTopic); } // Create final deduplicated summary JSONObject finalSummary = new JSONObject(summaryResponse.toString()); - finalSummary.put("sections", deduplicatedSections); + finalSummary.put("topics", deduplicatedTopics); return finalSummary; } From 3d3625e409f4a89c0953971d9c2bc5661946abb8 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:24:17 +0000 Subject: [PATCH 25/31] add assay_type and experiment_name to first phase outputs to aid second phase --- .../ai/expression/GeneRecordProcessor.java | 16 ++++++++++++++++ .../model/report/ai/expression/Summarizer.java | 3 +++ 2 files changed, 19 insertions(+) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index 6c5349953..d9317e587 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -36,6 +36,10 @@ public interface ExperimentInputs { String getDatasetId(); + String getAssayType(); + + String getExperimentName(); + String getDigest(); JSONObject getExperimentData(); @@ -107,6 +111,8 @@ private static List processExpressionData(RecordInstance recor } String datasetId = experimentRow.getAttributeValue("dataset_id").getValue(); + String assayType = experimentRow.getAttributeValue("assay_type").getValue(); + String experimentName = experimentRow.getAttributeValue("display_name").getValue(); List filteredData = readFilteredData(datasetId, expressionGraphsDataTable); @@ -118,6 +124,16 @@ private static List processExpressionData(RecordInstance recor public String getDatasetId() { return datasetId; } + + @Override + public String getAssayType() { + return assayType; + } + + @Override + public String getExperimentName() { + return experimentName; + } @Override public String getCacheKey() { diff --git 
a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index fcfac1d52..d7d86cd54 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -133,7 +133,10 @@ public CompletableFuture describeExperiment(ExperimentInputs experim String jsonString = completion.choices().get(0).message().content().get(); try { JSONObject jsonObject = new JSONObject(jsonString); + // add some fields directly to aid the final summarization jsonObject.put("dataset_id", experimentInputs.getDatasetId()); + jsonObject.put("assay_type", experimentInputs.getAssayType()); + jsonObject.put("experiment_name", experimentInputs.getExperimentName()); return jsonObject; } catch (JSONException e) { From e85ad1f2fe50fbab64d69b8b74eca4fea97193c4 Mon Sep 17 00:00:00 2001 From: Bob Date: Fri, 28 Feb 2025 11:52:41 +0000 Subject: [PATCH 26/31] sort second level inputs and add DATA_MODEL_VERSION for better cache control --- .../report/ai/expression/GeneRecordProcessor.java | 8 ++++++-- .../model/report/ai/expression/Summarizer.java | 10 ++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java index d9317e587..c4dc7c002 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java @@ -30,6 +30,10 @@ public class GeneRecordProcessor { public static final List REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE); + // Increment this to invalidate all previous cache entries: + // (for example if changing first level model outputs rather than inputs which are already digestified) + private static final String DATA_MODEL_VERSION = "v2"; + public interface ExperimentInputs { String getCacheKey(); @@ -86,7 +90,7 @@ public String getDigest() { List digests = experimentsWithData.stream() .map(exp -> new JSONObject().put("digest", exp.getDigest())) .collect(Collectors.toList()); - return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests)); + return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getFinalSummaryPrompt.apply(digests)); } }; @@ -142,7 +146,7 @@ public String getCacheKey() { @Override public String getDigest() { - return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData())); + return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getExperimentPrompt.apply(getExperimentData())); } @Override diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java index d7d86cd54..aa6402727 100644 --- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java +++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java @@ -148,9 +148,15 @@ public CompletableFuture describeExperiment(ExperimentInputs experim } public static String getFinalSummaryMessage(List experiments) { - + + List sortedExperiments = + experiments.sort( + Comparator.comparing((JSONObject obj) -> 

From e85ad1f2fe50fbab64d69b8b74eca4fea97193c4 Mon Sep 17 00:00:00 2001
From: Bob
Date: Fri, 28 Feb 2025 11:52:41 +0000
Subject: [PATCH 26/31] sort second level inputs and add DATA_MODEL_VERSION for better cache control

---
 .../report/ai/expression/GeneRecordProcessor.java |  8 ++++++--
 .../model/report/ai/expression/Summarizer.java    | 10 ++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index d9317e587..c4dc7c002 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -30,6 +30,10 @@ public class GeneRecordProcessor {
   public static final List<String> REQUIRED_TABLE_NAMES = List.of(EXPRESSION_GRAPH_TABLE, EXPRESSION_GRAPH_DATA_TABLE);
 
+  // Increment this to invalidate all previous cache entries:
+  // (for example if changing first level model outputs rather than inputs which are already digestified)
+  private static final String DATA_MODEL_VERSION = "v2";
+
   public interface ExperimentInputs {
 
     String getCacheKey();
 
@@ -86,7 +90,7 @@ public String getDigest() {
         List<JSONObject> digests = experimentsWithData.stream()
            .map(exp -> new JSONObject().put("digest", exp.getDigest()))
            .collect(Collectors.toList());
-        return EncryptionUtil.md5(aiChatModel + " " + getFinalSummaryPrompt.apply(digests));
+        return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getFinalSummaryPrompt.apply(digests));
       }
     };
 
@@ -142,7 +146,7 @@ public String getCacheKey() {
 
         @Override
         public String getDigest() {
-          return EncryptionUtil.md5(aiChatModel + " " + getExperimentPrompt.apply(getExperimentData()));
+          return EncryptionUtil.md5(aiChatModel + ":" + DATA_MODEL_VERSION + ":" + getExperimentPrompt.apply(getExperimentData()));
         }
 
         @Override
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index d7d86cd54..aa6402727 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -148,9 +148,15 @@ public CompletableFuture describeExperim
   }
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
-
+
+    List<JSONObject> sortedExperiments =
+      experiments.sort(
+        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+          .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+      );
+
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
-      String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
+      String.format("```json\n%s\n```\n\n", new JSONArray(sortedExperiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
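Two things happen in PATCH 26: the MD5 cache keys are salted with a hand-bumped DATA_MODEL_VERSION, so a change to what the first phase is asked to produce can invalidate every cached entry at once, and the second-phase inputs are sorted by importance and confidence. Note that `List.sort` returns `void`, so the `List sortedExperiments = experiments.sort(...)` assignment introduced here cannot compile as written; the next commit drops the assignment and sorts in place. A self-contained sketch of the version-salted key, with `MessageDigest` standing in for the project's EncryptionUtil.md5 and all literal values hypothetical:

```java
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;

public class CacheKeySketch {

  // Bump this to orphan every previously cached entry, as in GeneRecordProcessor
  private static final String DATA_MODEL_VERSION = "v2";

  // Same recipe as the patched getDigest(): model id + version + prompt -> md5 hex
  static String digest(String aiChatModel, String prompt) throws NoSuchAlgorithmException {
    String input = aiChatModel + ":" + DATA_MODEL_VERSION + ":" + prompt;
    byte[] hash = MessageDigest.getInstance("MD5").digest(input.getBytes(StandardCharsets.UTF_8));
    return String.format("%032x", new BigInteger(1, hash));
  }

  public static void main(String[] args) throws NoSuchAlgorithmException {
    // Changing either the model or the version string produces a different key,
    // which is what forces a cache miss and a fresh AI call.
    System.out.println(digest("gpt-4o-2024-11-20", "describe experiment DS_0000001"));
  }
}
```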

From dada1a5b04e456d4e0232501f9517a1f9f04ff8f Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 07:24:40 -0500
Subject: [PATCH 27/31] increase concurrency and fix bugs

---
 .../report/ai/expression/AiExpressionCache.java |  2 +-
 .../model/report/ai/expression/Summarizer.java  | 14 +++++++-------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index eae3a6dfc..3ea0a6dc1 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -34,7 +34,7 @@ public class AiExpressionCache {
   private static Logger LOG = Logger.getLogger(AiExpressionCache.class);
 
   // parallel processing
-  private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 5;
+  private static final int MAX_CONCURRENT_EXPERIMENT_LOOKUPS_PER_REQUEST = 10;
 
   // cache location
   private static final String CACHE_DIR_PROP_NAME = "AI_EXPRESSION_CACHE_DIR";
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index aa6402727..23860cb59 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -6,6 +6,7 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
+import java.util.Comparator;
 
 import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs;
 import org.gusdb.fgputil.json.JsonUtil;
@@ -149,19 +150,18 @@ public CompletableFuture describeExperim
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
 
-    List<JSONObject> sortedExperiments =
-      experiments.sort(
-        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
-          .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
-      );
+    experiments.sort(
+      Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+        .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+    );
 
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
-      String.format("```json\n%s\n```\n\n", new JSONArray(sortedExperiments).toString(2)) +
+      String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
       "- A concise one-sentence summary of the topic's experimental results\n\n" +
-      "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers."
+      "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers.";
   }
 
   public JSONObject summarizeExperiments(List<JSONObject> experiments) {
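PATCH 27 raises the per-request concurrency cap and fixes the sort by mutating the list in place (plus the missing semicolon on the prompt string). The comparator idiom it settles on — primary key descending, ties broken by a second key descending — is shown below on plain JSONObjects; the field names match the summaries, everything else is illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

import org.json.JSONObject;

public class ImportanceSortSketch {
  public static void main(String[] args) {
    List<JSONObject> experiments = new ArrayList<>(List.of(
        new JSONObject().put("dataset_id", "A").put("biological_importance", 2).put("confidence", 5),
        new JSONObject().put("dataset_id", "B").put("biological_importance", 4).put("confidence", 3),
        new JSONObject().put("dataset_id", "C").put("biological_importance", 4).put("confidence", 5)));

    // Same shape as the patch: most important first, higher confidence breaks ties.
    experiments.sort(
        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder()));

    // Prints C, B, A
    experiments.forEach(e -> System.out.println(e.getString("dataset_id")));
  }
}
```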

From 5c20b3238bb1cf73d5cc28d704a82c185a794e38 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 08:10:59 -0500
Subject: [PATCH 28/31] apply experiment summary reporting in proper place

---
 .../model/report/ai/expression/AiExpressionCache.java    | 8 ++++++++
 .../apicommon/model/report/ai/expression/Summarizer.java | 6 ------
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index 3ea0a6dc1..c7f108fb8 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -10,6 +10,7 @@
 import java.util.ArrayList;
 import java.util.List;
 import java.util.Optional;
+import java.util.Comparator;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -225,6 +226,13 @@ public JSONObject populateSummary(GeneSummaryInputs summaryInputs,
         // first populate each dataset entry as needed and collect experiment descriptors
         List<JSONObject> experiments = populateExperiments(summaryInputs.getExperimentsWithData(), experimentDescriber);
 
+        // sort them most-interesting first so that the "Other" section will be filled
+        // in that order (and also to give the AI the data in a sensible order)
+        experiments.sort(
+          Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
+            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
+        );
+
         // summarize experiments and store
         getPopulator(summaryInputs.getDigest(), () -> experimentSummarizer.apply(experiments)).accept(entryDir);
       },
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index 23860cb59..daa5e3d39 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -6,7 +6,6 @@
 import java.util.Map;
 import java.util.Set;
 import java.util.concurrent.CompletableFuture;
-import java.util.Comparator;
 
 import org.apidb.apicommon.model.report.ai.expression.GeneRecordProcessor.ExperimentInputs;
 import org.gusdb.fgputil.json.JsonUtil;
@@ -150,11 +149,6 @@ public CompletableFuture describeExperim
 
   public static String getFinalSummaryMessage(List<JSONObject> experiments) {
 
-    experiments.sort(
-      Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
-        .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder())
-    );
-
     return "Below are AI-generated summaries of one gene's behavior in all the transcriptomics experiments available in VEuPathDB, provided in JSON format:\n\n" +
       String.format("```json\n%s\n```\n\n", new JSONArray(experiments).toString(2)) +
       "Generate a one-paragraph summary (~100 words) describing the gene's expression. Structure it using <p>, <ul>, and <li> tags with no attributes. If relevant, briefly speculate on the gene's potential function, but only if justified by the data. Also, generate a short, specific headline for the summary. The headline must reflect this gene's expression and **must not** include generic phrases like \"comprehensive insights into\" or the word \"gene\".\n\n" +
       "Additionally, group the per-experiment summaries (identified by `dataset_id`) with `biological_importance > 3` and `confidence > 3` into sections by topic. For each topic, provide:\n" +
       "- A headline summarizing the key experimental results within the topic\n" +
       "- A concise one-sentence summary of the topic's experimental results\n\n" +
       "These topics will be displayed to users. In all generated text, wrap species names in `<i>` tags and use clear, precise scientific language accessible to non-native English speakers.";

From 5bd73448b7f2127f6630c076a90327ee6cf31554 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 08:17:11 -0500
Subject: [PATCH 29/31] Other topic section wording improved

---
 .../apidb/apicommon/model/report/ai/expression/Summarizer.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index daa5e3d39..359212bc4 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -245,7 +245,7 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       JSONObject otherTopic = new JSONObject();
       otherTopic.put("headline", "Other");
       otherTopic.put("one_sentence_summary",
-          "These experiments were not grouped by the AI.");
+          "The AI ordered these experiments by biological importance but did not group them into topics.");
       otherTopic.put("summaries", otherSummaries);
       deduplicatedTopics.put(otherTopic);
     }
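PATCHes 28 and 29 move that sort out of prompt construction and into AiExpressionCache.populateSummary, so one ordered list feeds both the second-phase prompt and the consolidation step that builds the "Other" topic, and the user-facing wording for that topic is updated to match. Below is a compact sketch of the sort-once-then-summarize flow; the Function stands in for experimentSummarizer and simply echoes the order it receives, and all names and values are illustrative:

```java
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.function.Function;

import org.json.JSONArray;
import org.json.JSONObject;

public class SortBeforeSummarizeSketch {

  // Stand-in for experimentSummarizer.apply(experiments): echoes the ordered ids.
  static final Function<List<JSONObject>, JSONObject> SUMMARIZER = experiments -> {
    JSONArray ids = new JSONArray();
    experiments.forEach(e -> ids.put(e.getString("dataset_id")));
    return new JSONObject().put("ordered_dataset_ids", ids);
  };

  public static void main(String[] args) {
    List<JSONObject> experiments = new ArrayList<>(List.of(
        new JSONObject().put("dataset_id", "DS_minor").put("biological_importance", 1).put("confidence", 2),
        new JSONObject().put("dataset_id", "DS_major").put("biological_importance", 5).put("confidence", 5)));

    // Sort once, before summarization, so every downstream consumer sees the same order.
    experiments.sort(
        Comparator.comparing((JSONObject obj) -> obj.optInt("biological_importance"), Comparator.reverseOrder())
            .thenComparing(obj -> obj.optInt("confidence"), Comparator.reverseOrder()));

    System.out.println(SUMMARIZER.apply(experiments).toString(2)); // DS_major listed first
  }
}
```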

From ac11cafbfa146715e302953a0bf890ecd3cbfd05 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 09:05:34 -0500
Subject: [PATCH 30/31] banish empty topics

---
 .../model/report/ai/expression/GeneRecordProcessor.java  | 2 +-
 .../apicommon/model/report/ai/expression/Summarizer.java | 9 ++++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index c4dc7c002..c18080da3 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -32,7 +32,7 @@ public class GeneRecordProcessor {
 
   // Increment this to invalidate all previous cache entries:
   // (for example if changing first level model outputs rather than inputs which are already digestified)
-  private static final String DATA_MODEL_VERSION = "v2";
+  private static final String DATA_MODEL_VERSION = "v3";
 
   public interface ExperimentInputs {
 
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index 359212bc4..e11957295 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -226,9 +226,12 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       }
 
       // Update topic with mapped summaries and remove dataset_ids key
-      topic.put("summaries", summaries);
-      topic.remove("dataset_ids");
-      deduplicatedTopics.put(topic);
+      // but only if it's a non-empty topic (can happen with bad dataset_ids, see above)
+      if (summaries.length() > 0) {
+        topic.put("summaries", summaries);
+        topic.remove("dataset_ids");
+        deduplicatedTopics.put(topic);
+      }
     }
 
     // Find missing dataset IDs
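PATCH 30 guards against topics whose `dataset_ids` never matched a real summary (the model can emit bogus ids): a topic is kept only if at least one summary was mapped into it. The guard in isolation, with hypothetical data and the same org.json API the patch uses:

```java
import org.json.JSONArray;
import org.json.JSONObject;

public class EmptyTopicGuardSketch {
  public static void main(String[] args) {
    JSONArray deduplicatedTopics = new JSONArray();

    JSONObject topic = new JSONObject()
        .put("headline", "Oocyst development")
        .put("dataset_ids", new JSONArray());  // none of the ids resolved to summaries

    JSONArray summaries = new JSONArray();     // stays empty because nothing matched

    // Same guard as the patch: only keep topics that actually gathered summaries.
    if (summaries.length() > 0) {
      topic.put("summaries", summaries);
      topic.remove("dataset_ids");
      deduplicatedTopics.put(topic);
    }

    System.out.println(deduplicatedTopics.length()); // 0 -- the empty topic is dropped
  }
}
```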

From 27fa586b803658101aea47d1e1a806e9baa54537 Mon Sep 17 00:00:00 2001
From: Bob MacCallum
Date: Fri, 28 Feb 2025 11:57:30 -0500
Subject: [PATCH 31/31] preserve sort order during consolidation step

---
 .../report/ai/expression/AiExpressionCache.java    |  1 +
 .../report/ai/expression/GeneRecordProcessor.java  |  2 +-
 .../model/report/ai/expression/Summarizer.java     | 15 ++++++++-------
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
index c7f108fb8..688a8dcb3 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/AiExpressionCache.java
@@ -11,6 +11,7 @@
 import java.util.List;
 import java.util.Optional;
 import java.util.Comparator;
+import java.util.stream.Collectors;
 import java.util.concurrent.CompletableFuture;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
index c18080da3..26fa39bab 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/GeneRecordProcessor.java
@@ -32,7 +32,7 @@ public class GeneRecordProcessor {
 
   // Increment this to invalidate all previous cache entries:
   // (for example if changing first level model outputs rather than inputs which are already digestified)
-  private static final String DATA_MODEL_VERSION = "v3";
+  private static final String DATA_MODEL_VERSION = "v3b";
 
   public interface ExperimentInputs {
 
diff --git a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
index e11957295..7d4e7c009 100644
--- a/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
+++ b/Model/src/main/java/org/apidb/apicommon/model/report/ai/expression/Summarizer.java
@@ -1,7 +1,7 @@
 package org.apidb.apicommon.model.report.ai.expression;
 
-import java.util.HashMap;
-import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
@@ -193,13 +193,14 @@ public JSONObject summarizeExperiments(List experiments) {
   private static JSONObject consolidateSummary(JSONObject summaryResponse,
       List<JSONObject> individualResults) {
 
-    // Gather all dataset IDs from individualResults and map them to summaries
-    Map<String, JSONObject> datasetSummaries = new HashMap<>();
+    // Gather all dataset IDs from individualResults and map them to summaries.
+    // Preserving the order of individualResults.
+    Map<String, JSONObject> datasetSummaries = new LinkedHashMap<>();
     for (JSONObject result : individualResults) {
       datasetSummaries.put(result.getString("dataset_id"), result);
    }
 
-    Set<String> seenDatasetIds = new HashSet<>();
+    Set<String> seenDatasetIds = new LinkedHashSet<>();
     JSONArray deduplicatedTopics = new JSONArray();
 
     JSONArray topics = summaryResponse.getJSONArray("topics");
@@ -234,8 +235,8 @@ private static JSONObject consolidateSummary(JSONObject summaryResponse,
       }
     }
 
-    // Find missing dataset IDs
-    Set<String> missingDatasetIds = new HashSet<>(datasetSummaries.keySet());
+    // Find missing dataset IDs (preserve dataset order)
+    Set<String> missingDatasetIds = new LinkedHashSet<>(datasetSummaries.keySet());
     missingDatasetIds.removeAll(seenDatasetIds);
 
     // If there are missing IDs, add an "Others" topic
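The LinkedHashMap/LinkedHashSet switch in PATCH 31 matters because consolidateSummary iterates datasetSummaries to build the "Other" topic: a plain HashMap would scramble the importance-sorted order established in populateSummary, while the linked variants replay insertion order. A tiny, self-contained illustration with hypothetical keys:

```java
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;

public class OrderPreservationSketch {
  public static void main(String[] args) {
    // Already sorted most-interesting first, as populateSummary now guarantees
    String[] sortedIds = { "DS_best", "DS_good", "DS_ok", "DS_minor" };

    Map<String, String> hashed = new HashMap<>();
    Map<String, String> linked = new LinkedHashMap<>();
    for (String id : sortedIds) {
      hashed.put(id, "summary of " + id);
      linked.put(id, "summary of " + id);
    }

    // HashMap iteration order depends on hashing; LinkedHashMap replays insertion
    // order, which is what keeps the "Other" topic sorted most-interesting first.
    System.out.println("HashMap keys:       " + hashed.keySet());
    System.out.println("LinkedHashMap keys: " + linked.keySet());
  }
}
```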