From e50a599d4fe6fadae90c19dfabaf12f976483bb4 Mon Sep 17 00:00:00 2001 From: Alan Zhu Date: Tue, 23 Dec 2025 12:04:37 +0800 Subject: [PATCH] Auto-detect iOS WDA screen scale for coordinate conversion --- main.py | 22 +++++++++- phone_agent/xctest/__init__.py | 8 ++++ phone_agent/xctest/connection.py | 72 ++++++++++++++++++++++++++++++++ phone_agent/xctest/device.py | 53 +++++++++++++++++------ 4 files changed, 142 insertions(+), 13 deletions(-) diff --git a/main.py b/main.py index 8cdc34b2..1469a0a9 100755 --- a/main.py +++ b/main.py @@ -30,7 +30,7 @@ from phone_agent.config.apps_ios import list_supported_apps as list_ios_apps from phone_agent.device_factory import DeviceType, get_device_factory, set_device_type from phone_agent.model import ModelConfig -from phone_agent.xctest import XCTestConnection +from phone_agent.xctest import XCTestConnection, set_scale_factor from phone_agent.xctest import list_devices as list_ios_devices @@ -753,6 +753,26 @@ def main(): ) if device_type == DeviceType.IOS: + # Auto-detect iOS WDA scale factor (pixels -> points) instead of hard-coding 3. + # You can override it with env PHONE_AGENT_IOS_SCALE (float). + detected_scale: float | None = None + try: + env_scale = os.getenv("PHONE_AGENT_IOS_SCALE") + if env_scale: + detected_scale = float(env_scale) + else: + detected_scale = XCTestConnection(wda_url=args.wda_url).detect_screen_scale( + session_id=None, + device_id=args.device_id, + default=3.0, + ) + + if detected_scale and detected_scale > 0: + set_scale_factor(detected_scale) + except Exception: + # Fall back to default scale in xctest.device + pass + # Create iOS agent agent_config = IOSAgentConfig( max_steps=args.max_steps, diff --git a/phone_agent/xctest/__init__.py b/phone_agent/xctest/__init__.py index f9fad38a..f69aadb7 100644 --- a/phone_agent/xctest/__init__.py +++ b/phone_agent/xctest/__init__.py @@ -10,10 +10,12 @@ from phone_agent.xctest.device import ( back, double_tap, + get_scale_factor, get_current_app, home, launch_app, long_press, + set_scale_factor, swipe, tap, ) @@ -38,6 +40,8 @@ "double_tap", "long_press", "launch_app", + "set_scale_factor", + "get_scale_factor", # Connection management "XCTestConnection", "DeviceInfo", @@ -45,3 +49,7 @@ "quick_connect", "list_devices", ] + +# Re-export convenience methods (available on XCTestConnection). +# Kept for discoverability in higher-level modules. + diff --git a/phone_agent/xctest/connection.py b/phone_agent/xctest/connection.py index deb29369..724c8822 100644 --- a/phone_agent/xctest/connection.py +++ b/phone_agent/xctest/connection.py @@ -252,6 +252,78 @@ def start_wda_session(self) -> tuple[bool, str]: except Exception as e: return False, f"Error starting WDA session: {e}" + def get_wda_screen(self) -> dict | None: + """Get WDA screen information (/wda/screen). + + Returns: + The JSON-decoded response dict on success, otherwise None. + """ + try: + import requests + + response = requests.get(f"{self.wda_url}/wda/screen", timeout=5, verify=False) + if response.status_code == 200: + return response.json() + return None + except Exception: + return None + + def detect_screen_scale( + self, + session_id: str | None = None, + device_id: str | None = None, + default: float = 3.0, + ) -> float: + """Detect iOS screen scale factor used by WDA coordinate system. + + Priority: + 1) Use `/wda/screen` -> value.scale if available. + 2) Fallback: infer scale by comparing screenshot pixel size with screenSize points. + + This keeps the detection details inside xctest, so CLI/agent code can stay clean. + """ + screen = self.get_wda_screen() + try: + if screen and isinstance(screen, dict): + value = screen.get("value", {}) or {} + scale = value.get("scale") + if isinstance(scale, (int, float)) and scale > 0: + return float(scale) + + screen_size = value.get("screenSize", {}) or {} + width_pt = screen_size.get("width") + height_pt = screen_size.get("height") + + if isinstance(width_pt, (int, float)) and isinstance(height_pt, (int, float)): + # Import locally to avoid circular import at module import time. + from phone_agent.xctest.screenshot import get_screenshot + + shot = get_screenshot( + wda_url=self.wda_url, + session_id=session_id, + device_id=device_id, + ) + + # Best-effort inference: choose the more stable ratio between width/height. + ratio_w = shot.width / float(width_pt) if width_pt else 0 + ratio_h = shot.height / float(height_pt) if height_pt else 0 + ratio = ratio_w if ratio_w > 0 else ratio_h + if ratio_h > 0 and ratio_w > 0: + # If both available, use the rounded average to reduce rotation noise. + ratio = (ratio_w + ratio_h) / 2 + + # WDA scale is typically 1/2/3. Round to nearest int if close. + rounded = round(ratio) if ratio > 0 else 0 + if rounded in (1, 2, 3) and abs(ratio - rounded) < 0.25: + return float(rounded) + if ratio > 0: + return float(ratio) + except Exception: + # Never block agent startup due to scale detection. + pass + + return float(default) + def get_wda_status(self) -> dict | None: """ Get WebDriverAgent status information. diff --git a/phone_agent/xctest/device.py b/phone_agent/xctest/device.py index 49fc379c..b07abb09 100644 --- a/phone_agent/xctest/device.py +++ b/phone_agent/xctest/device.py @@ -6,7 +6,27 @@ from phone_agent.config.apps_ios import APP_PACKAGES_IOS as APP_PACKAGES -SCALE_FACTOR = 3 # 3 for most modern iPhone +# WDA expects coordinates in "points" while our higher-level code +# mostly works in screenshot pixel coordinates. +# This factor converts pixels -> points. +_SCALE_FACTOR: float = 3.0 # default for many modern iPhones + + +def set_scale_factor(scale: float) -> None: + """Set the global pixel->point scale factor used for coordinate conversion.""" + global _SCALE_FACTOR + try: + scale_f = float(scale) + if scale_f > 0: + _SCALE_FACTOR = scale_f + except Exception: + # Keep previous value on invalid input + return + + +def get_scale_factor() -> float: + """Get the global pixel->point scale factor used for coordinate conversion.""" + return float(_SCALE_FACTOR) def _get_wda_session_url(wda_url: str, session_id: str | None, endpoint: str) -> str: """ @@ -95,6 +115,7 @@ def tap( url = _get_wda_session_url(wda_url, session_id, "actions") # W3C WebDriver Actions API for tap/click + scale = get_scale_factor() actions = { "actions": [ { @@ -102,7 +123,7 @@ def tap( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale}, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": 0.1}, {"type": "pointerUp", "button": 0}, @@ -143,6 +164,8 @@ def double_tap( url = _get_wda_session_url(wda_url, session_id, "actions") + scale = get_scale_factor() + # W3C WebDriver Actions API for double tap actions = { "actions": [ @@ -151,11 +174,12 @@ def double_tap( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale}, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": 100}, {"type": "pointerUp", "button": 0}, {"type": "pause", "duration": 100}, + {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale}, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": 100}, {"type": "pointerUp", "button": 0}, @@ -202,6 +226,7 @@ def long_press( # Convert duration to milliseconds duration_ms = int(duration * 1000) + scale = get_scale_factor() actions = { "actions": [ { @@ -209,7 +234,7 @@ def long_press( "id": "finger1", "parameters": {"pointerType": "touch"}, "actions": [ - {"type": "pointerMove", "duration": 0, "x": x / SCALE_FACTOR, "y": y / SCALE_FACTOR}, + {"type": "pointerMove", "duration": 0, "x": x / scale, "y": y / scale}, {"type": "pointerDown", "button": 0}, {"type": "pause", "duration": duration_ms}, {"type": "pointerUp", "button": 0}, @@ -262,12 +287,14 @@ def swipe( url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration") + scale = get_scale_factor() + # WDA dragfromtoforduration API payload payload = { - "fromX": start_x / SCALE_FACTOR, - "fromY": start_y / SCALE_FACTOR, - "toX": end_x / SCALE_FACTOR, - "toY": end_y / SCALE_FACTOR, + "fromX": start_x / scale, + "fromY": start_y / scale, + "toX": end_x / scale, + "toY": end_y / scale, "duration": duration, } @@ -303,12 +330,14 @@ def back( url = _get_wda_session_url(wda_url, session_id, "wda/dragfromtoforduration") + scale = get_scale_factor() + # Swipe from left edge to simulate back gesture payload = { - "fromX": 0, - "fromY": 640, - "toX": 400, - "toY": 640, + "fromX": 0 / scale, + "fromY": 640 / scale, + "toX": 400 / scale, + "toY": 640 / scale, "duration": 0.3, }