Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 5 additions & 4 deletions docs/ios_setup/ios_setup.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
- iOS 设备(iPhone/iPad)
- USB 数据线或同一 WiFi 网络


## WebDriverAgent 配置

WebDriverAgent 是 iOS 自动化的核心组件,需要在 iOS 设备上运行。
Expand All @@ -28,7 +27,7 @@ cd WebDriverAgent

1. 在 Xcode 中选中 `WebDriverAgent`,会出现 General、Signing & Capabilities 等选项卡。
2. 进入 `Signing & Capabilities` 选项卡
3. 勾选 `Automatically manage signing`。在Team中选择自己的开发者账号
3. 勾选 `Automatically manage signing`。在Team中选择自己的开发者账号
4. 将 Bundle ID 改为唯一标识符,例如:`com.yourname.WebDriverAgentRunner`
![设置签名1](resources/ios0_WebDriverAgent0.png)

Expand All @@ -43,7 +42,7 @@ Mac和iPhone有USB和WiFi两种连接方式,建议通过USB方式,成功率
#### 通过 WiFi 连接

需要满足以下条件:
1. 通过USB连接。在Finder中选中连接的IPhone,在“通用”中勾选"在 WiFi 中显示这台 iPhone"
1. 通过 USB 连接。在 Finder 中选中已连接的 iPhone,在“通用”中勾选“在 WiFi 中显示这台 iPhone”
2. Mac 与 iPhone 处于同一 WiFi 网络之下

#### 具体步骤
Expand All @@ -52,7 +51,7 @@ Mac和iPhone有USB和WiFi两种连接方式,建议通过USB方式,成功率

![选择设备](resources/select-your-iphone-device.png)

3. 长按"▶️"运行按钮,选择 "Test" 后开始编译并部署到你的 iPhone 上
1. 长按"▶️"运行按钮,选择 "Test" 后开始编译并部署到你的 iPhone 上

![开始测试](resources/start-wda-testing.png)

Expand Down Expand Up @@ -85,6 +84,7 @@ brew install libimobiledevice
# 设备检查
idevice_id -ln
```

2. 使用 xcodebuild 安装 WebDriverAgent。命令行方式同样需要完成“设备信任配置”,请参考 GUI 模式下的方法。

```
Expand All @@ -95,6 +95,7 @@ xcodebuild -project WebDriverAgent.xcodeproj \
-destination 'platform=iOS,name=YOUR_PHONE_NAME' \
test
```

这里,`YOUR_PHONE_NAME` 可以在 Xcode 的 GUI 中看到。
WebDriverAgent 成功运行后,会在 Xcode 控制台输出类似以下信息:

Expand Down
20 changes: 9 additions & 11 deletions ios.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
PHONE_AGENT_MODEL: Model name (default: autoglm-phone-9b)
PHONE_AGENT_MAX_STEPS: Maximum steps per task (default: 100)
PHONE_AGENT_WDA_URL: WebDriverAgent URL (default: http://localhost:8100)
PHONE_AGENT_DEVICE_ID: iOS device UDID for multi-device setups
PHONE_AGENT_DEVICE_ID: iOS device UDID for multi-device setups
"""

import argparse
Expand Down Expand Up @@ -99,9 +99,7 @@ def check_system_requirements(wda_url: str = "http://localhost:8100") -> bool:
print(" 4. Or connect via WiFi using device IP")
all_passed = False
else:
device_names = [
d.device_name or d.device_id[:8] + "..." for d in devices
]
device_names = [d.device_name or d.device_id[:8] + "..." for d in devices]
print(f"✅ OK ({len(devices)} device(s): {', '.join(device_names)})")
except Exception as e:
print("❌ FAILED")
Expand Down Expand Up @@ -261,7 +259,7 @@ def parse_args() -> argparse.Namespace:
python ios.py --base-url http://localhost:8000/v1

# Run with specific device
python ios.py --device-id <UDID>
python ios.py --device-id <UUID>

# Use WiFi connection
python ios.py --wda-url http://192.168.1.100:8100
Expand Down Expand Up @@ -315,7 +313,7 @@ def parse_args() -> argparse.Namespace:
"-d",
type=str,
default=os.getenv("PHONE_AGENT_DEVICE_ID"),
help="iOS device UDID",
help="iOS device UUID",
)

parser.add_argument(
Expand All @@ -326,7 +324,9 @@ def parse_args() -> argparse.Namespace:
)

parser.add_argument(
"--list-devices", action="store_true", help="List connected iOS devices and exit"
"--list-devices",
action="store_true",
help="List connected iOS devices and exit",
)

parser.add_argument(
Expand Down Expand Up @@ -396,7 +396,7 @@ def handle_device_commands(args) -> bool:
name_info = device.device_name or "Unnamed"

print(f" ✓ {name_info}")
print(f" UDID: {device.device_id}")
print(f" UUID: {device.device_id}")
print(f" Model: {model_info}")
print(f" OS: {ios_info}")
print(f" Connection: {conn_type}")
Expand Down Expand Up @@ -474,9 +474,7 @@ def main():

# Create configurations
model_config = ModelConfig(
base_url=args.base_url,
model_name=args.model,
api_key=args.api_key
base_url=args.base_url, model_name=args.model, api_key=args.api_key
)

agent_config = IOSAgentConfig(
Expand Down
113 changes: 100 additions & 13 deletions phone_agent/actions/handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ def _send_keyevent(self, keycode: str) -> None:
# Handle HDC devices with HarmonyOS-specific keyEvent command
if device_factory.device_type == DeviceType.HDC:
hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"]

# Map common keycodes to HarmonyOS keyEvent codes
# KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code)
if keycode == "KEYCODE_ENTER" or keycode == "66":
Expand All @@ -283,7 +283,8 @@ def _send_keyevent(self, keycode: str) -> None:
# For now, only handle ENTER, other keys may need mapping
if "ENTER" in keycode:
_run_hdc_command(
hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
hdc_prefix
+ ["shell", "uitest", "uiInput", "keyEvent", "2054"],
capture_output=True,
text=True,
)
Expand All @@ -297,7 +298,8 @@ def _send_keyevent(self, keycode: str) -> None:
else:
# Assume it's a numeric code
_run_hdc_command(
hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
hdc_prefix
+ ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
capture_output=True,
text=True,
)
Expand Down Expand Up @@ -342,22 +344,101 @@ def parse_action(response: str) -> dict[str, Any]:
Raises:
ValueError: If the response cannot be parsed.
"""
print(f"Parsing action: {response}")
if not response or not response.strip():
return {"_metadata": "finish", "message": "Model returned an empty action."}

try:
response = response.strip()

# 1. Try to extract do(...) or finish(...) using regex if it's wrapped in other text
do_match = re.search(r"do\(.*?\)", response, re.DOTALL)
finish_match = re.search(r"finish\(.*?\)", response, re.DOTALL)

if do_match:
action_str = do_match.group(0)
# Special handling for Type action with text that might contain special characters
if 'action="Type"' in action_str or 'action="Type_Name"' in action_str:
if "text=" in action_str:
try:
# Try to extract text between quotes more robustly
text_part = action_str.split("text=", 1)[1]
# Find the first and last quote
first_quote = text_part.find('"')
last_quote = text_part.rfind('"')
if (
first_quote != -1
and last_quote != -1
and first_quote < last_quote
):
text = text_part[first_quote + 1 : last_quote]
# Extract action type
action_type = (
"Type" if 'action="Type"' in action_str else "Type_Name"
)
return {
"_metadata": "do",
"action": action_type,
"text": text,
}
except Exception:
pass # Fallback to AST if regex fails

# Standard do(...) parsing using AST
try:
# Clean up the string for AST
clean_str = (
action_str.replace("\n", "\\n")
.replace("\r", "\\r")
.replace("\t", "\\t")
)
tree = ast.parse(clean_str, mode="eval")
if isinstance(tree.body, ast.Call):
call = tree.body
action = {"_metadata": "do"}
for keyword in call.keywords:
key = keyword.arg
try:
value = ast.literal_eval(keyword.value)
action[key] = value
except (ValueError, SyntaxError):
# Fallback for non-literal values (though model should only output literals)
if isinstance(keyword.value, ast.Constant):
action[key] = keyword.value.value
else:
# Last resort: raw string representation
action[key] = str(keyword.value)
return action
except (SyntaxError, ValueError) as e:
print(f"AST parsing failed for {action_str}: {e}")
# If it's a simple do(action="Home") but AST failed, try one more manual parse
if 'action="Home"' in action_str:
return {"_metadata": "do", "action": "Home"}

if finish_match:
action_str = finish_match.group(0)
# Simple extraction for finish(message="...")
message = ""
if 'message="' in action_str:
parts = action_str.split('message="', 1)[1].rsplit('"', 1)
if len(parts) >= 1:
message = parts[0]
elif "message='" in action_str:
parts = action_str.split("message='", 1)[1].rsplit("'", 1)
if len(parts) >= 1:
message = parts[0]

return {"_metadata": "finish", "message": message}

# Legacy/Fallback behavior
if response.startswith('do(action="Type"') or response.startswith(
'do(action="Type_Name"'
):
text = response.split("text=", 1)[1][1:-2]
action = {"_metadata": "do", "action": "Type", "text": text}
return action
elif response.startswith("do"):
# Use AST parsing instead of eval for safety
try:
# Escape special characters (newlines, tabs, etc.) for valid Python syntax
response = response.replace('\n', '\\n')
response = response.replace('\r', '\\r')
response = response.replace('\t', '\\t')
response = response.replace("\n", "\\n")
response = response.replace("\r", "\\r")
response = response.replace("\t", "\\t")

tree = ast.parse(response, mode="eval")
if not isinstance(tree.body, ast.Call):
Expand All @@ -381,10 +462,16 @@ def parse_action(response: str) -> dict[str, Any]:
"message": response.replace("finish(message=", "")[1:-2],
}
else:
raise ValueError(f"Failed to parse action: {response}")
# If all parsing attempts fail, treat the entire response as a message for 'finish'
# This is more robust than crashing with a ValueError
action = {"_metadata": "finish", "message": response}
return action
except Exception as e:
raise ValueError(f"Failed to parse action: {e}")
# Final fallback: return the original response if possible
return {
"_metadata": "finish",
"message": f"Parsing failed: {str(e)}. Raw response: {response}",
}


def do(**kwargs) -> dict[str, Any]:
Expand Down
4 changes: 1 addition & 3 deletions phone_agent/actions/handler_ios.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,7 @@ def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
if not app_name:
return ActionResult(False, False, "No app name specified")

success = launch_app(
app_name, wda_url=self.wda_url, session_id=self.session_id
)
success = launch_app(app_name, wda_url=self.wda_url, session_id=self.session_id)
if success:
return ActionResult(True, False)
return ActionResult(False, False, f"App not found: {app_name}")
Expand Down
12 changes: 9 additions & 3 deletions phone_agent/adb/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ def disconnect(self, address: str | None = None) -> tuple[bool, str]:
if address:
cmd.append(address)

result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
result = subprocess.run(
cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
)

output = result.stdout + result.stderr
return True, output.strip() or "Disconnected"
Expand Down Expand Up @@ -241,7 +243,9 @@ def enable_tcpip(
cmd.extend(["-s", device_id])
cmd.extend(["tcpip", str(port)])

result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10)
result = subprocess.run(
cmd, capture_output=True, text=True, encoding="utf-8", timeout=10
)

output = result.stdout + result.stderr

Expand Down Expand Up @@ -270,7 +274,9 @@ def get_device_ip(self, device_id: str | None = None) -> str | None:
cmd.extend(["-s", device_id])
cmd.extend(["shell", "ip", "route"])

result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
result = subprocess.run(
cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
)

# Parse IP from route output
for line in result.stdout.split("\n"):
Expand Down
5 changes: 4 additions & 1 deletion phone_agent/adb/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,10 @@ def get_current_app(device_id: str | None = None) -> str:
adb_prefix = _get_adb_prefix(device_id)

result = subprocess.run(
adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True, encoding="utf-8"
adb_prefix + ["shell", "dumpsys", "window"],
capture_output=True,
text=True,
encoding="utf-8",
)
output = result.stdout
if not output:
Expand Down
8 changes: 3 additions & 5 deletions phone_agent/agent.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
"""Main PhoneAgent class for orchestrating phone automation."""

import json
import os
import traceback
from dataclasses import dataclass
from datetime import datetime
from typing import Any, Callable

from phone_agent.actions import ActionHandler
Expand Down Expand Up @@ -171,9 +173,6 @@ def _execute_step(
# Get model response
try:
msgs = get_messages(self.agent_config.lang)
print("\n" + "=" * 50)
print(f"💭 {msgs['thinking']}:")
print("-" * 50)
response = self.model_client.request(self._context)
except Exception as e:
if self.agent_config.verbose:
Expand All @@ -195,8 +194,7 @@ def _execute_step(
action = finish(message=response.action)

if self.agent_config.verbose:
# Print thinking process
print("-" * 50)
# Print parsed action
print(f"🎯 {msgs['action']}:")
print(json.dumps(action, ensure_ascii=False, indent=2))
print("=" * 50 + "\n")
Expand Down
9 changes: 2 additions & 7 deletions phone_agent/agent_ios.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ class IOSAgentConfig:
max_steps: int = 100
wda_url: str = "http://localhost:8100"
session_id: str | None = None
device_id: str | None = None # iOS device UDID
device_id: str | None = None # iOS device UUID
lang: str = "cn"
system_prompt: str | None = None
verbose: bool = True
Expand Down Expand Up @@ -214,13 +214,8 @@ def _execute_step(
action = finish(message=response.action)

if self.agent_config.verbose:
# Print thinking process
# Print parsed action
msgs = get_messages(self.agent_config.lang)
print("\n" + "=" * 50)
print(f"💭 {msgs['thinking']}:")
print("-" * 50)
print(response.thinking)
print("-" * 50)
print(f"🎯 {msgs['action']}:")
print(json.dumps(action, ensure_ascii=False, indent=2))
print("=" * 50 + "\n")
Expand Down
Loading