Skip to content

Fix issue #6262: [Bug]: "Success/failure" behavior is not consistent for file reading and editing #6263

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 11 additions & 5 deletions evaluation/benchmarks/swe_bench/eval_infer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import json
import os
import subprocess
import tempfile
import time
from functools import partial
Expand Down Expand Up @@ -27,6 +28,7 @@
)
from openhands.core.config import (
AppConfig,
LLMConfig,
SandboxConfig,
get_parser,
)
Expand Down Expand Up @@ -415,13 +417,17 @@ def process_instance(
else:
# Initialize with a dummy metadata when file doesn't exist
metadata = EvalMetadata(
agent_class="dummy_agent", # Placeholder agent class
llm_config=LLMConfig(model="dummy_model"), # Minimal LLM config
agent_class='dummy_agent', # Placeholder agent class
llm_config=LLMConfig(model='dummy_model'), # Minimal LLM config
max_iterations=1, # Minimal iterations
eval_output_dir=os.path.dirname(args.input_file), # Use input file dir as output dir
eval_output_dir=os.path.dirname(
args.input_file
), # Use input file dir as output dir
start_time=time.strftime('%Y-%m-%d %H:%M:%S'), # Current time
git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode('utf-8').strip(), # Current commit
dataset=args.dataset # Dataset name from args
git_commit=subprocess.check_output(['git', 'rev-parse', 'HEAD'])
.decode('utf-8')
.strip(), # Current commit
dataset=args.dataset, # Dataset name from args
)

# The evaluation harness constrains the signature of `process_instance_func` but we need to
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { afterEach, beforeAll, describe, expect, it, vi } from "vitest";
import type { Message } from "#/message";
import { act, screen, waitFor, within } from "@testing-library/react";
import userEvent from "@testing-library/user-event";
import { renderWithProviders } from "test-utils";
Expand Down
82 changes: 82 additions & 0 deletions frontend/__tests__/components/file-operations.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
import { render, screen } from "@testing-library/react";
import { describe, it, expect } from "vitest";
import { Messages } from "#/components/features/chat/messages";
import type { Message } from "#/message";

describe("File Operations Messages", () => {
it("should show success indicator for successful file read operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "read_file_contents",
content: "Successfully read file contents",
success: true,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];

render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);

const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-success");
});

it("should show failure indicator for failed file read operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "read_file_contents",
content: "Failed to read file contents",
success: false,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];

render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);

const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
});

it("should show success indicator for successful file edit operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "edit_file_contents",
content: "Successfully edited file contents",
success: true,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];

render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);

const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-success");
});

it("should show failure indicator for failed file edit operation", () => {
const messages: Message[] = [
{
type: "action",
translationID: "edit_file_contents",
content: "Failed to edit file contents",
success: false,
sender: "assistant",
timestamp: new Date().toISOString(),
},
];

render(<Messages messages={messages} isAwaitingUserConfirmation={false} />);

const statusIcon = screen.getByTestId("status-icon");
expect(statusIcon).toBeInTheDocument();
expect(statusIcon.closest("svg")).toHaveClass("fill-danger");
});
});
54 changes: 27 additions & 27 deletions frontend/src/components/features/chat/expandable-message.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -46,15 +46,15 @@ export function ExpandableMessage({
)}
>
<div className="text-sm w-full">
{headline && (
<div className="flex flex-row justify-between items-center w-full">
<span
className={cn(
"font-bold",
type === "error" ? "text-danger" : "text-neutral-300",
)}
>
{headline}
<div className="flex flex-row justify-between items-center w-full">
<span
className={cn(
headline ? "font-bold" : "",
type === "error" ? "text-danger" : "text-neutral-300",
)}
>
{headline}
{headline && (
<button
type="button"
onClick={() => setShowDetails(!showDetails)}
Expand All @@ -76,25 +76,25 @@ export function ExpandableMessage({
/>
)}
</button>
</span>
{type === "action" && success !== undefined && (
<span className="flex-shrink-0">
{success ? (
<CheckCircle
data-testid="status-icon"
className={cn(statusIconClasses, "fill-success")}
/>
) : (
<XCircle
data-testid="status-icon"
className={cn(statusIconClasses, "fill-danger")}
/>
)}
</span>
)}
</div>
)}
{showDetails && (
</span>
{type === "action" && success !== undefined && (
<span className="flex-shrink-0">
{success ? (
<CheckCircle
data-testid="status-icon"
className={cn(statusIconClasses, "fill-success")}
/>
) : (
<XCircle
data-testid="status-icon"
className={cn(statusIconClasses, "fill-danger")}
/>
)}
</span>
)}
</div>
{(!headline || showDetails) && (
<Markdown
className="text-sm overflow-auto"
components={{
Expand Down
1 change: 1 addition & 0 deletions frontend/src/components/features/chat/messages.tsx
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import React from "react";
import type { Message } from "#/message";
import { ChatMessage } from "#/components/features/chat/chat-message";
import { ConfirmationButtons } from "#/components/shared/buttons/confirmation-buttons";
import { ImageCarousel } from "../images/image-carousel";
Expand Down
2 changes: 1 addition & 1 deletion frontend/src/message.d.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
type Message = {
export type Message = {
sender: "user" | "assistant";
content: string;
timestamp: string;
Expand Down
10 changes: 10 additions & 0 deletions frontend/src/state/chat-slice.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import { createSlice, PayloadAction } from "@reduxjs/toolkit";
import type { Message } from "#/message";

import { ActionSecurityRisk } from "#/state/security-analyzer-slice";
import {
Expand Down Expand Up @@ -107,6 +108,10 @@ export const chatSlice = createSlice({
text = `${action.payload.args.path}\n${content}`;
} else if (actionID === "browse") {
text = `Browsing ${action.payload.args.url}`;
} else if (actionID === "read") {
text = `Reading file ${action.payload.args.path}`;
} else if (actionID === "edit") {
text = `Editing file ${action.payload.args.path}`;
}
if (actionID === "run" || actionID === "run_ipython") {
if (
Expand Down Expand Up @@ -154,6 +159,11 @@ export const chatSlice = createSlice({
causeMessage.success = !ipythonObs.content
.toLowerCase()
.includes("error:");
} else if (observationID === "read" || observationID === "edit") {
// For read/edit operations, we consider it successful if there's content and no error
causeMessage.success =
observation.payload.content.length > 0 &&
!observation.payload.content.toLowerCase().includes("error:");
}

if (observationID === "run" || observationID === "run_ipython") {
Expand Down
12 changes: 6 additions & 6 deletions openhands/runtime/impl/docker/docker_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -405,11 +405,11 @@ def pause(self):
"""Pause the runtime by stopping the container.
This is different from container.stop() as it ensures environment variables are properly preserved."""
if not self.container:
raise RuntimeError("Container not initialized")
raise RuntimeError('Container not initialized')

# First, ensure all environment variables are properly persisted in .bashrc
# This is already handled by add_env_vars in base.py

# Stop the container
self.container.stop()
self.log('debug', f'Container {self.container_name} paused')
Expand All @@ -418,12 +418,12 @@ def resume(self):
"""Resume the runtime by starting the container.
This is different from container.start() as it ensures environment variables are properly restored."""
if not self.container:
raise RuntimeError("Container not initialized")
raise RuntimeError('Container not initialized')

# Start the container
self.container.start()
self.log('debug', f'Container {self.container_name} resumed')

# Wait for the container to be ready
self._wait_until_alive()

Expand Down
5 changes: 4 additions & 1 deletion openhands/runtime/impl/modal/modal_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,10 @@ def vscode_url(self) -> str | None:

tunnel = self.sandbox.tunnels()[self._vscode_port]
tunnel_url = tunnel.url
self._vscode_url = tunnel_url + f'/?tkn={token}&folder={self.config.workspace_mount_path_in_sandbox}'
self._vscode_url = (
tunnel_url
+ f'/?tkn={token}&folder={self.config.workspace_mount_path_in_sandbox}'
)

self.log(
'debug',
Expand Down
13 changes: 9 additions & 4 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,8 @@ def _wait_until_alive(self):
stop=tenacity.stop_after_delay(
self.config.sandbox.remote_runtime_init_timeout
)
| stop_if_should_exit() | self._stop_if_closed,
| stop_if_should_exit()
| self._stop_if_closed,
reraise=True,
retry=tenacity.retry_if_exception_type(AgentRuntimeNotReadyError),
wait=tenacity.wait_fixed(2),
Expand Down Expand Up @@ -394,10 +395,14 @@ def _send_action_server_request(self, method, url, **kwargs):

retry_decorator = tenacity.retry(
retry=tenacity.retry_if_exception_type(ConnectionError),
stop=tenacity.stop_after_attempt(3) | stop_if_should_exit() | self._stop_if_closed,
stop=tenacity.stop_after_attempt(3)
| stop_if_should_exit()
| self._stop_if_closed,
wait=tenacity.wait_exponential(multiplier=1, min=4, max=60),
)
return retry_decorator(self._send_action_server_request_impl)(method, url, **kwargs)
return retry_decorator(self._send_action_server_request_impl)(
method, url, **kwargs
)

def _send_action_server_request_impl(self, method, url, **kwargs):
try:
Expand Down Expand Up @@ -430,6 +435,6 @@ def _send_action_server_request_impl(self, method, url, **kwargs):
) from e
else:
raise e

def _stop_if_closed(self, retry_state: tenacity.RetryCallState) -> bool:
return self._runtime_closed
Original file line number Diff line number Diff line change
Expand Up @@ -13,4 +13,4 @@ function deactivate() {}
module.exports = {
activate,
deactivate
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -20,4 +20,4 @@
"title": "Hello World from OpenHands"
}]
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -149,8 +149,8 @@ async def _cleanup_stale(self):
self._close_session(sid) for sid in self._local_agent_loops_by_sid
)
return
except Exception as e:
logger.error(f'error_cleaning_stale')
except Exception:
logger.error('error_cleaning_stale')
await asyncio.sleep(_CLEANUP_INTERVAL)

async def get_running_agent_loops(
Expand Down
Loading