zai-org · Einspanner123 · Jan 6, 2026 · Jan 6, 2026
diff --git a/docs/ios_setup/ios_setup.md b/docs/ios_setup/ios_setup.md
@@ -10,7 +10,6 @@
 - iOS 设备（iPhone/iPad）
 - USB 数据线或同一 WiFi 网络
 
-
 ## WebDriverAgent 配置
 
 WebDriverAgent 是 iOS 自动化的核心组件，需要在 iOS 设备上运行。
@@ -28,7 +27,7 @@ cd WebDriverAgent
 
 1. 在 Xcode 中选中 `WebDriverAgent`，出现General、Signing&Capabilities等选项。
 2. 进入 `Signing & Capabilities` 选项卡
-3.   勾选 `Automatically manage signing`。在Team中选择自己的开发者账号
+3. 勾选 `Automatically manage signing`。在Team中选择自己的开发者账号
 4. 将 Bundle ID 改为唯一标识符，例如：`com.yourname.WebDriverAgentRunner`
 ![设置签名1](resources/ios0_WebDriverAgent0.png)
 
@@ -43,7 +42,7 @@ Mac和iPhone有USB和WiFi两种连接方式，建议通过USB方式，成功率
 #### 通过 WiFi 连接
 
 需要满足以下条件：
-1.  通过USB连接。在Finder中选中连接的IPhone，在“通用”中勾选"在 WiFi 中显示这台 iPhone"
+1. 通过USB连接。在Finder中选中连接的iPhone，在“通用”中勾选"在 WiFi 中显示这台 iPhone"
 2. Mac 与 iPhone 处于同一 WiFi 网络之下
 
 #### 具体步骤
@@ -52,7 +51,7 @@ Mac和iPhone有USB和WiFi两种连接方式，建议通过USB方式，成功率
 
 ![选择设备](resources/select-your-iphone-device.png)
 
-3. 长按"▶️"运行按钮，选择 "Test" 后开始编译并部署到你的 iPhone 上
+3. 长按"▶️"运行按钮，选择 "Test" 后开始编译并部署到你的 iPhone 上
 
 ![开始测试](resources/start-wda-testing.png)
 
@@ -85,6 +84,7 @@ brew install libimobiledevice
 # 设备检查
 idevice_id -ln
 ```
+
 2.使用xcodebuild安装WebAgent。命令行也需要进行“设备信任配置”，参考GUI模式下的方法。
 
 ```
@@ -95,6 +95,7 @@ xcodebuild -project WebDriverAgent.xcodeproj \
            -destination 'platform=iOS,name=YOUR_PHONE_NAME' \
            test
 ```
+
 这里，YOUR_PHONE_NAME可以在xcode的GUI中看到。
 WebDriverAgent 成功运行后，会在 Xcode 控制台输出类似以下信息：
 

diff --git a/ios.py b/ios.py
@@ -10,7 +10,7 @@
     PHONE_AGENT_MODEL: Model name (default: autoglm-phone-9b)
     PHONE_AGENT_MAX_STEPS: Maximum steps per task (default: 100)
     PHONE_AGENT_WDA_URL: WebDriverAgent URL (default: http://localhost:8100)
-    PHONE_AGENT_DEVICE_ID: iOS device UDID for multi-device setups
+    PHONE_AGENT_DEVICE_ID: iOS device UDID for multi-device setups
 """
 
 import argparse
@@ -99,9 +99,7 @@ def check_system_requirements(wda_url: str = "http://localhost:8100") -> bool:
             print("     4. Or connect via WiFi using device IP")
             all_passed = False
         else:
-            device_names = [
-                d.device_name or d.device_id[:8] + "..." for d in devices
-            ]
+            device_names = [d.device_name or d.device_id[:8] + "..." for d in devices]
             print(f"✅ OK ({len(devices)} device(s): {', '.join(device_names)})")
     except Exception as e:
         print("❌ FAILED")
@@ -261,7 +259,7 @@ def parse_args() -> argparse.Namespace:
     python ios.py --base-url http://localhost:8000/v1
 
     # Run with specific device
-    python ios.py --device-id <UDID>
+    python ios.py --device-id <UUID>
 
     # Use WiFi connection
     python ios.py --wda-url http://192.168.1.100:8100
@@ -315,7 +313,7 @@ def parse_args() -> argparse.Namespace:
         "-d",
         type=str,
         default=os.getenv("PHONE_AGENT_DEVICE_ID"),
-        help="iOS device UDID",
+        help="iOS device UUID",
     )
 
     parser.add_argument(
@@ -326,7 +324,9 @@ def parse_args() -> argparse.Namespace:
     )
 
     parser.add_argument(
-        "--list-devices", action="store_true", help="List connected iOS devices and exit"
+        "--list-devices",
+        action="store_true",
+        help="List connected iOS devices and exit",
     )
 
     parser.add_argument(
@@ -396,7 +396,7 @@ def handle_device_commands(args) -> bool:
                 name_info = device.device_name or "Unnamed"
 
                 print(f"  ✓ {name_info}")
-                print(f"    UDID: {device.device_id}")
+                print(f"    UUID: {device.device_id}")
                 print(f"    Model: {model_info}")
                 print(f"    OS: {ios_info}")
                 print(f"    Connection: {conn_type}")
@@ -474,9 +474,7 @@ def main():
 
     # Create configurations
     model_config = ModelConfig(
-        base_url=args.base_url,
-        model_name=args.model,
-        api_key=args.api_key
+        base_url=args.base_url, model_name=args.model, api_key=args.api_key
     )
 
     agent_config = IOSAgentConfig(

diff --git a/phone_agent/actions/handler.py b/phone_agent/actions/handler.py
@@ -265,7 +265,7 @@ def _send_keyevent(self, keycode: str) -> None:
         # Handle HDC devices with HarmonyOS-specific keyEvent command
         if device_factory.device_type == DeviceType.HDC:
             hdc_prefix = ["hdc", "-t", self.device_id] if self.device_id else ["hdc"]
-            
+
             # Map common keycodes to HarmonyOS keyEvent codes
             # KEYCODE_ENTER (66) -> 2054 (HarmonyOS Enter key code)
             if keycode == "KEYCODE_ENTER" or keycode == "66":
@@ -283,7 +283,8 @@ def _send_keyevent(self, keycode: str) -> None:
                         # For now, only handle ENTER, other keys may need mapping
                         if "ENTER" in keycode:
                             _run_hdc_command(
-                                hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
+                                hdc_prefix
+                                + ["shell", "uitest", "uiInput", "keyEvent", "2054"],
                                 capture_output=True,
                                 text=True,
                             )
@@ -297,7 +298,8 @@ def _send_keyevent(self, keycode: str) -> None:
                     else:
                         # Assume it's a numeric code
                         _run_hdc_command(
-                            hdc_prefix + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
+                            hdc_prefix
+                            + ["shell", "uitest", "uiInput", "keyEvent", str(keycode)],
                             capture_output=True,
                             text=True,
                         )
@@ -342,22 +344,101 @@ def parse_action(response: str) -> dict[str, Any]:
     Raises:
         ValueError: If the response cannot be parsed.
     """
-    print(f"Parsing action: {response}")
+    if not response or not response.strip():
+        return {"_metadata": "finish", "message": "Model returned an empty action."}
+
     try:
         response = response.strip()
+
+        # 1. Try to extract do(...) or finish(...) using regex if it's wrapped in other text
+        do_match = re.search(r"do\(.*?\)", response, re.DOTALL)
+        finish_match = re.search(r"finish\(.*?\)", response, re.DOTALL)
+
+        if do_match:
+            action_str = do_match.group(0)
+            # Special handling for Type action with text that might contain special characters
+            if 'action="Type"' in action_str or 'action="Type_Name"' in action_str:
+                if "text=" in action_str:
+                    try:
+                        # Try to extract text between quotes more robustly
+                        text_part = action_str.split("text=", 1)[1]
+                        # Find the first and last quote
+                        first_quote = text_part.find('"')
+                        last_quote = text_part.rfind('"')
+                        if (
+                            first_quote != -1
+                            and last_quote != -1
+                            and first_quote < last_quote
+                        ):
+                            text = text_part[first_quote + 1 : last_quote]
+                            # Extract action type
+                            action_type = (
+                                "Type" if 'action="Type"' in action_str else "Type_Name"
+                            )
+                            return {
+                                "_metadata": "do",
+                                "action": action_type,
+                                "text": text,
+                            }
+                    except Exception:
+                        pass  # Fallback to AST if regex fails
+
+            # Standard do(...) parsing using AST
+            try:
+                # Clean up the string for AST
+                clean_str = (
+                    action_str.replace("\n", "\\n")
+                    .replace("\r", "\\r")
+                    .replace("\t", "\\t")
+                )
+                tree = ast.parse(clean_str, mode="eval")
+                if isinstance(tree.body, ast.Call):
+                    call = tree.body
+                    action = {"_metadata": "do"}
+                    for keyword in call.keywords:
+                        key = keyword.arg
+                        try:
+                            value = ast.literal_eval(keyword.value)
+                            action[key] = value
+                        except (ValueError, SyntaxError):
+                            # Fallback for non-literal values (though model should only output literals)
+                            if isinstance(keyword.value, ast.Constant):
+                                action[key] = keyword.value.value
+                            else:
+                                # Last resort: raw string representation
+                                action[key] = str(keyword.value)
+                    return action
+            except (SyntaxError, ValueError) as e:
+                print(f"AST parsing failed for {action_str}: {e}")
+                # If it's a simple do(action="Home") but AST failed, try one more manual parse
+                if 'action="Home"' in action_str:
+                    return {"_metadata": "do", "action": "Home"}
+
+        if finish_match:
+            action_str = finish_match.group(0)
+            # Simple extraction for finish(message="...")
+            message = ""
+            if 'message="' in action_str:
+                parts = action_str.split('message="', 1)[1].rsplit('"', 1)
+                if len(parts) >= 1:
+                    message = parts[0]
+            elif "message='" in action_str:
+                parts = action_str.split("message='", 1)[1].rsplit("'", 1)
+                if len(parts) >= 1:
+                    message = parts[0]
+
+            return {"_metadata": "finish", "message": message}
+
+        # Legacy/Fallback behavior
         if response.startswith('do(action="Type"') or response.startswith(
             'do(action="Type_Name"'
         ):
-            text = response.split("text=", 1)[1][1:-2]
-            action = {"_metadata": "do", "action": "Type", "text": text}
-            return action
-        elif response.startswith("do"):
             # Use AST parsing instead of eval for safety
             try:
                 # Escape special characters (newlines, tabs, etc.) for valid Python syntax
-                response = response.replace('\n', '\\n')
-                response = response.replace('\r', '\\r')
-                response = response.replace('\t', '\\t')
+                response = response.replace("\n", "\\n")
+                response = response.replace("\r", "\\r")
+                response = response.replace("\t", "\\t")
 
                 tree = ast.parse(response, mode="eval")
                 if not isinstance(tree.body, ast.Call):
@@ -381,10 +462,16 @@ def parse_action(response: str) -> dict[str, Any]:
                 "message": response.replace("finish(message=", "")[1:-2],
             }
         else:
-            raise ValueError(f"Failed to parse action: {response}")
+            # If all parsing attempts fail, treat the entire response as a message for 'finish'
+            # This is more robust than crashing with a ValueError
+            action = {"_metadata": "finish", "message": response}
         return action
     except Exception as e:
-        raise ValueError(f"Failed to parse action: {e}")
+        # Final fallback: return the original response if possible
+        return {
+            "_metadata": "finish",
+            "message": f"Parsing failed: {str(e)}. Raw response: {response}",
+        }
 
 
 def do(**kwargs) -> dict[str, Any]:

diff --git a/phone_agent/actions/handler_ios.py b/phone_agent/actions/handler_ios.py
@@ -129,9 +129,7 @@ def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
         if not app_name:
             return ActionResult(False, False, "No app name specified")
 
-        success = launch_app(
-            app_name, wda_url=self.wda_url, session_id=self.session_id
-        )
+        success = launch_app(app_name, wda_url=self.wda_url, session_id=self.session_id)
         if success:
             return ActionResult(True, False)
         return ActionResult(False, False, f"App not found: {app_name}")

diff --git a/phone_agent/adb/connection.py b/phone_agent/adb/connection.py
@@ -109,7 +109,9 @@ def disconnect(self, address: str | None = None) -> tuple[bool, str]:
             if address:
                 cmd.append(address)
 
-            result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
+            )
 
             output = result.stdout + result.stderr
             return True, output.strip() or "Disconnected"
@@ -241,7 +243,9 @@ def enable_tcpip(
                 cmd.extend(["-s", device_id])
             cmd.extend(["tcpip", str(port)])
 
-            result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=10)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=10
+            )
 
             output = result.stdout + result.stderr
 
@@ -270,7 +274,9 @@ def get_device_ip(self, device_id: str | None = None) -> str | None:
                 cmd.extend(["-s", device_id])
             cmd.extend(["shell", "ip", "route"])
 
-            result = subprocess.run(cmd, capture_output=True, text=True, encoding="utf-8", timeout=5)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
+            )
 
             # Parse IP from route output
             for line in result.stdout.split("\n"):

diff --git a/phone_agent/adb/device.py b/phone_agent/adb/device.py
@@ -22,7 +22,10 @@ def get_current_app(device_id: str | None = None) -> str:
     adb_prefix = _get_adb_prefix(device_id)
 
     result = subprocess.run(
-        adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True, encoding="utf-8"
+        adb_prefix + ["shell", "dumpsys", "window"],
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
     )
     output = result.stdout
     if not output:

diff --git a/phone_agent/agent.py b/phone_agent/agent.py
@@ -1,8 +1,10 @@
 """Main PhoneAgent class for orchestrating phone automation."""
 
 import json
+import os
 import traceback
 from dataclasses import dataclass
+from datetime import datetime
 from typing import Any, Callable
 
 from phone_agent.actions import ActionHandler
@@ -171,9 +173,6 @@ def _execute_step(
         # Get model response
         try:
             msgs = get_messages(self.agent_config.lang)
-            print("\n" + "=" * 50)
-            print(f"💭 {msgs['thinking']}:")
-            print("-" * 50)
             response = self.model_client.request(self._context)
         except Exception as e:
             if self.agent_config.verbose:
@@ -195,8 +194,7 @@ def _execute_step(
             action = finish(message=response.action)
 
         if self.agent_config.verbose:
-            # Print thinking process
-            print("-" * 50)
+            # Print parsed action
             print(f"🎯 {msgs['action']}:")
             print(json.dumps(action, ensure_ascii=False, indent=2))
             print("=" * 50 + "\n")

diff --git a/phone_agent/agent_ios.py b/phone_agent/agent_ios.py
@@ -20,7 +20,7 @@ class IOSAgentConfig:
     max_steps: int = 100
     wda_url: str = "http://localhost:8100"
     session_id: str | None = None
-    device_id: str | None = None  # iOS device UDID
+    device_id: str | None = None  # iOS device UDID
     lang: str = "cn"
     system_prompt: str | None = None
     verbose: bool = True
@@ -214,13 +214,8 @@ def _execute_step(
             action = finish(message=response.action)
 
         if self.agent_config.verbose:
-            # Print thinking process
+            # Print parsed action
             msgs = get_messages(self.agent_config.lang)
-            print("\n" + "=" * 50)
-            print(f"💭 {msgs['thinking']}:")
-            print("-" * 50)
-            print(response.thinking)
-            print("-" * 50)
             print(f"🎯 {msgs['action']}:")
             print(json.dumps(action, ensure_ascii=False, indent=2))
             print("=" * 50 + "\n")