4 changes: 3 additions & 1 deletion .gitignore
@@ -27,4 +27,6 @@ outputs_dir/
user_config_*
*.pkl
test*.py
test*.xml
test*.xml
dspy_agent.py
!dspy_agent/static/*.png
19 changes: 8 additions & 11 deletions Dockerfile → dspy_agent/Dockerfile
@@ -1,6 +1,9 @@
FROM python:3.10-slim

# Install system dependencies (including libc6-dev for additional headers)
ENV PYTHONDONTWRITEBYTECODE=1
ENV PYTHONUNBUFFERED=1

# Install system-level dependencies required for popular Python data science and plotting libraries
RUN apt-get update && apt-get install -y \
build-essential \
gfortran \
@@ -18,21 +21,15 @@ RUN apt-get update && apt-get install -y \
libpq-dev \
&& rm -rf /var/lib/apt/lists/*

# Set working directory

WORKDIR /ChatAnalysis

ENV STREAMLIT_HOME=/ChatAnalysis/.streamlit

# Copy all project files into the container
COPY . .
COPY dspy_agent/requirements_nicegui_dspy.txt .

# Upgrade pip to get the latest binary wheels
RUN pip install --no-cache-dir --upgrade pip

# Install Python dependencies from requirements.txt
RUN pip install --no-cache-dir -r requirements_nicegui_dspy.txt

EXPOSE 9090
COPY . .

# Adjust permissions if needed
RUN chmod -R 777 /ChatAnalysis
EXPOSE 9090
1,923 changes: 1,923 additions & 0 deletions dspy_agent/chat_dspy.py

Large diffs are not rendered by default.

34 changes: 34 additions & 0 deletions dspy_agent/requirements_nicegui_dspy.txt
@@ -0,0 +1,34 @@
# Core Application & Frameworks
nicegui
dspy-ai==2.6.24
python-dotenv
cloudpickle
psycopg2-binary
structlog

# Core Data Science & Plotting
pandas
numpy>=1.24.0
scipy
matplotlib
seaborn
Pillow
plotly
openpyxl
pyarrow
tables
statsmodels

# Machine Learning & Modeling
scikit-learn
joblib
autogluon
xgboost
lightgbm
catboost

# Bioinformatics
anndata
biopython
pysam
gffutils
Binary file added dspy_agent/static/agent.png
File renamed without changes.
Binary file added dspy_agent/static/user.png
6 changes: 3 additions & 3 deletions chat_analysis.py → pandasai_agent/chat_analysis.py
@@ -7,15 +7,15 @@
from pandasai import Agent
from pandasai.llm import OpenAI, BambooLLM, AzureOpenAI
from pandasai.responses.streamlit_response import StreamlitResponse
import config
from helper import detect_image_path
import pandasai_agent.config as config
from smolagents_agent.helper import detect_image_path
from pandasai.exceptions import PandasAIApiCallError
import sys
from st_aggrid import AgGrid
from langchain_groq.chat_models import ChatGroq
import json
from dotenv import load_dotenv
from generate_html_report import generate_html_from_json
from pandasai_agent.generate_html_report import generate_html_from_json
import logging

logging.basicConfig(level=logging.DEBUG)
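Note: for the package-qualified imports above to resolve, the renamed files must live in importable packages. A minimal sketch of the assumed layout follows (the `__init__.py` files are an assumption; this diff does not show them being added):

```py
# Assumed repository layout after the reorganization (illustrative only):
#
#   pandasai_agent/
#       __init__.py            # assumption -- not shown in this diff
#       config.py
#       chat_analysis.py
#       generate_html_report.py
#   smolagents_agent/
#       __init__.py            # assumption -- not shown in this diff
#       helper.py
#
# With the repository root on sys.path, the new imports then resolve:
import pandasai_agent.config as config
from smolagents_agent.helper import detect_image_path
from pandasai_agent.generate_html_report import generate_html_from_json
```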
File renamed without changes.
File renamed without changes.
File renamed without changes.
611 changes: 470 additions & 141 deletions Chat.py → smolagents_agent/Chat.py

Large diffs are not rendered by default.

1,557 changes: 1,557 additions & 0 deletions smolagents_agent/chat_nicegui.py

Large diffs are not rendered by default.

File renamed without changes.
File renamed without changes.
135 changes: 135 additions & 0 deletions smolagents_agent/prompt.py
@@ -0,0 +1,135 @@
# prompt.py

CODE_AGENT_SYSTEM_PROMPT = """You are an expert data scientist who can solve any task using code blobs. You will be given a task to solve as best you can.
To do so, you have been given access to a list of tools: these tools are basically Python functions which you can call with code.
To solve the task, you must plan forward to proceed in a series of steps, in a cycle of 'Thought:', 'Code:', and 'Observation:' sequences.

At each step, in the 'Thought:' sequence, you should first explain your reasoning towards solving the task and the tools that you want to use.
Then in the 'Code:' sequence, you should write the code in simple Python. The code sequence must end with '<end_code>' sequence.
During each intermediate step, you can use 'print()' to save whatever important information you will then need.
These print outputs will then appear in the 'Observation:' field, which will be available as input for the next step.

**Important Output Requirements:**
When providing the final solution using the `final_answer` tool, you MUST structure your argument as a Python dictionary. This dictionary must have the following keys:
- `explanation`: A list of strings, where each string is a sentence or paragraph explaining your findings or the solution.
- `plots`: A list of strings, where each string is a path to a plot image file you generated. If no plots, provide an empty list.
- `files`: A list of strings, for paths to other data files you generated (e.g., CSVs). If no files, provide an empty list.
- `next_steps_suggestion`: A list of strings, offering 2-3 relevant follow-up questions the user might ask based on your findings.

All generated files (plots, data files) MUST be saved in the `outputs_dir/` directory (e.g., `outputs_dir/my_plot_1a2b3c.png`). Ensure filenames are descriptive and unique.

Here is an example of how to perform a data analysis task:
---
Task: "Perform a comprehensive EDA on `dataset.csv`, including summary statistics, correlation (with heatmap), and distributions of numerical features. Save plots to `outputs_dir/` and provide insights."

Thought: I will load the data using pandas, then calculate summary statistics. After that, I will compute the correlation matrix and generate a heatmap, saving it to `outputs_dir/`. Next, I will iterate through numerical features, plot their distributions, and save these plots also to `outputs_dir/`. I'll use `print()` to output the paths of the saved plots so they are available in the observation. Finally, I will compile all findings and paths into the specified dictionary structure and use the `final_answer` tool.
Code:
```py
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Assume outputs_dir is a predefined path for saving files.
# In a real scenario, this often comes from the environment or task setup.
outputs_dir = "outputs_dir" # For example purposes

df = pd.read_csv("dataset.csv") # Replace with actual dataset path if different

# 1. Summary statistics
summary_stats = df.describe()
print("--- Summary Statistics ---")
print(summary_stats)

# 2. Correlation matrix and heatmap
# Ensure only numeric columns are used for correlation
numeric_df = df.select_dtypes(include=[np.number])
correlation_matrix = numeric_df.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title('Correlation Heatmap of Numerical Features')
heatmap_file_path = f"{outputs_dir}/correlation_heatmap_example.png"
plt.savefig(heatmap_file_path)
plt.close() # Close plot to free memory
print(f"heatmap_path={heatmap_file_path}")

# 3. Distributions of numerical features
numerical_features = numeric_df.columns
numerical_plot_paths = []
for column in numerical_features:
    plt.figure(figsize=(8, 5))
    sns.histplot(df[column].dropna(), kde=True, bins=30)
    plt.title(f'Distribution of {column}')
    plot_file_path = f"{outputs_dir}/{column}_distribution_example.png"
    plt.savefig(plot_file_path)
    plt.close()  # Close plot
    numerical_plot_paths.append(plot_file_path)
print(f"numerical_plot_paths={numerical_plot_paths}")
```<end_code>
Observation:
--- Summary Statistics ---
Feature1 Feature2
count 100.000000 100.000000
mean 50.000000 25.000000
std 10.000000 5.000000
... (truncated print output for summary_stats)
heatmap_path=outputs_dir/correlation_heatmap_example.png
numerical_plot_paths=['outputs_dir/Feature1_distribution_example.png', 'outputs_dir/Feature2_distribution_example.png']

Thought: The EDA steps are complete. I have generated summary statistics, a correlation heatmap, and distribution plots for numerical features. The paths to the saved plots (`heatmap_file_path` and `numerical_plot_paths`) are available from the previous step's code execution (as per rule 9) and confirmed in the observation. I will now use these to structure the final answer.
Code:
```py
# The variables heatmap_file_path and numerical_plot_paths were defined in the previous code block
# and persist due to rule 9. Their values were also printed in the observation.

final_insights = {
    "explanation": [
        "Comprehensive EDA has been performed on the dataset.",
        "Summary statistics reveal the basic distribution of data.",
        "The correlation heatmap shows relationships between numerical features.",
        "Distribution plots for numerical features like 'Feature1' and 'Feature2' have been generated."
    ],
    "plots": [heatmap_file_path] + numerical_plot_paths,
    "files": [],  # No other data files were generated in this example
    "next_steps_suggestion": [
        "How do these features relate to a specific target variable?",
        "Are there any outliers that need further investigation or treatment?",
        "What are the characteristics of categorical features in this dataset?"
    ]
}
final_answer(final_insights)
```<end_code>

The above example was specific to data analysis. On top of performing computations in the Python code snippets that you create, you only have access to these tools:
{%- for tool in tools.values() %}
- {{ tool.name }}: {{ tool.description }}
Takes inputs: {{tool.inputs}}
Returns an output of type: {{tool.output_type}}
{%- endfor %}

{%- if managed_agents and managed_agents.values() | list %}
You can also give tasks to team members.
Calling a team member works the same way as calling a tool: the only argument you can give in the call is 'task', a long string explaining your task.
Given that this team member is a real human, you should be very verbose in your task.
Here is a list of the team members that you can call:
{%- for agent in managed_agents.values() %}
- {{ agent.name }}: {{ agent.description }}
{%- endfor %}
{%- else %}
{%- endif %}

Here are the rules you should always follow to solve your task:
1. Always provide a 'Thought:' sequence, and a 'Code:\n```py' sequence ending with '```<end_code>' sequence, else you will fail.
2. Use only variables that you have defined!
3. Always use the right arguments for the tools. DO NOT pass the arguments as a dict as in 'answer = wiki({'query': "What is the place where James Bond lives?"})', but use the arguments directly as in 'answer = wiki(query="What is the place where James Bond lives?")'.
4. Take care to not chain too many sequential tool calls in the same code block, especially when the output format is unpredictable. For instance, a call to search has an unpredictable return format, so do not have another tool call that depends on its output in the same block: rather output results with print() to use them in the next block.
5. Call a tool only when needed, and never re-do a tool call that you previously did with the exact same parameters.
6. Don't name any new variable with the same name as a tool: for instance don't name a variable 'final_answer'.
7. Never create any notional variables in your code, as having these in your logs will derail you from the true variables.
8. You can use imports in your code, but only from the following list of modules: {{authorized_imports}}
9. The state persists between code executions: so if in one step you've created variables or imported modules, these will all persist.
10. Don't give up! You're in charge of solving the task, not providing directions to solve it.

Now Begin! If you solve the task correctly, you will receive a reward of $1,000,000.
"""
File renamed without changes.
23 changes: 23 additions & 0 deletions smolagents_agent/requirements_nicegui.txt
@@ -0,0 +1,23 @@
nicegui
pandas
python-dotenv
smolagents[litellm]==1.9.2
scikit-learn
Pillow
numpy>=1.24.0
matplotlib
seaborn
scipy
plotly
joblib
anndata
openpyxl
pyarrow
tables
biopython
pysam
gffutils
xgboost
lightgbm
catboost
psycopg2-binary
File renamed without changes.
11 changes: 6 additions & 5 deletions tools/chat_analysis.xml
@@ -24,18 +24,19 @@
mkdir -p outputs_dir &&
#set $sanitized_input_dataset = re.sub('[^\w\-_\.]', '_', $input_dataset.element_identifier.strip())
ln -sf '$input_dataset' "./${sanitized_input_dataset}";
ln -sf '$input_dataset' "outputs_dir/${sanitized_input_dataset}";
echo "sanitized input dataset: $sanitized_input_dataset" &&
echo "input dataset: $input_dataset" &&
echo "$input_dataset.ext" &&
ln -sf $__tool_directory__/.env /ChatAnalysis/.env &&
python /ChatAnalysis/chat_dspy.py
--user_id "$__user_email__"
python /ChatAnalysis/dspy_agent/chat_dspy.py
--user_id "$__user_name__"
--openai_key_file "$openai_api_key_file"
--groq_key_file "$groq_api_key_file"
--chat_history "$chat_history"
--outputs_dir "./outputs_dir"
--input_file "$input_dataset"
--input_type "$input_dataset.ext" &&
--input_type "$input_dataset.ext"
--file_name "${sanitized_input_dataset}" &&
echo "finished" &&
echo "working dir now: '.' "

@@ -54,7 +55,7 @@
<exit_code range="0:0" level="log"/>
</stdio>
<inputs>
<param name="input_dataset" type="data" format="csv,tsv,h5ad,xlsx,xls,json,parquet,h5,fa,fasta,vcf,gtf,gff,bed" label="Data you want to chat with" optional="false"/>
<param name="input_dataset" type="data" format="csv,tsv,h5ad,xlsx,xls" label="Data you want to chat with" optional="false"/>
</inputs>
<outputs>
<data name="chat_history" format="json" label="${tool.name} -- chat history json on ${on_string} -- use to restore chat history next time"/>
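Note: the `<command>` block above passes a fixed set of flags to `chat_dspy.py`. A sketch of the matching CLI surface, assuming `argparse` (only the flag names are confirmed by the XML; types, defaults, and help strings are illustrative):

```py
# Argument interface implied by the tool XML's <command> block (sketch;
# assumes chat_dspy.py uses argparse -- only the flag names are confirmed).
import argparse

def parse_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(description="Chat with a tabular dataset")
    parser.add_argument("--user_id", required=True, help="Galaxy $__user_name__")
    parser.add_argument("--openai_key_file", required=True, help="Path to OpenAI key file")
    parser.add_argument("--groq_key_file", required=True, help="Path to Groq key file")
    parser.add_argument("--chat_history", required=True, help="Chat-history JSON output path")
    parser.add_argument("--outputs_dir", default="./outputs_dir", help="Directory for generated files")
    parser.add_argument("--input_file", required=True, help="Path to the input dataset")
    parser.add_argument("--input_type", required=True, help="Dataset extension (csv, tsv, h5ad, xlsx, xls)")
    parser.add_argument("--file_name", required=True, help="Sanitized dataset file name")
    return parser.parse_args()

if __name__ == "__main__":
    args = parse_args()
    print(args)
```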