AppAgent Source Code (self_explorer.py)
1. Code Overview
AppAgent is an autonomous exploration tool for Android apps. It explores an app's interface by simulating user actions (taps, swipes, text input, and so on) and generates documentation from what it observes. The main functionality of this script includes:
- Connecting to an Android device and capturing screen information.
- Parsing the app's UI elements.
- Using a large language model (OpenAI or Qwen) to decide the next action.
- Executing the action and recording the result.
- Generating documentation based on the outcome of each action.
2. Code Execution Logic
2.1 Initialization
import argparse
import ast
import datetime
import json
import os
import re
import sys
import time
import prompts
from config import load_config
from and_controller import list_all_devices, AndroidController, traverse_tree
from model import parse_explore_rsp, parse_reflect_rsp, OpenAIModel, QwenModel
from utils import print_with_color, draw_bbox_multi
- argparse: parses command-line arguments.
- ast: evaluates string representations of Python literals (used to read saved documentation files).
- datetime: formats the timestamp used in the task name.
- json: serializes log entries.
- os: handles file and directory paths.
- re: performs regular-expression substitutions on the prompt templates.
- sys: system-level operations such as exiting the program.
- time: timestamps and sleep intervals between requests.
- prompts, config, and_controller, model, utils: project modules providing the prompt templates, the configuration loader, the Android device controller and XML traversal, the model wrappers and response parsers, and the display and drawing utilities.
2.2 Parsing Command-Line Arguments
arg_desc = "AppAgent - Autonomous Exploration"
parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description=arg_desc)
parser.add_argument("--app")
parser.add_argument("--root_dir", default="./")
args = vars(parser.parse_args())
- --app: the name of the target app.
- --root_dir: the working directory; defaults to the current directory.
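For illustration (the app name is just an example), an invocation such as python self_explorer.py --app weather leaves --root_dir at its default, so the parsed dict becomes:
args == {"app": "weather", "root_dir": "./"}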
2.3 Loading the Configuration
configs = load_config()
- Loads configuration items, such as the model type and API keys, from the configuration file.
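The loader itself lives in config.py and is not shown here. A minimal sketch of what such a load_config could look like, assuming a YAML file named config.yaml and an environment-variable override (both are assumptions; the repo's actual implementation may differ):

import os
import yaml

def load_config(config_path="./config.yaml"):
    # Read settings such as MODEL, API keys, MAX_ROUNDS, REQUEST_INTERVAL from a YAML file
    with open(config_path, "r") as f:
        configs = yaml.safe_load(f)
    # Let environment variables override values from the file (assumed behavior)
    for key in list(configs.keys()):
        if key in os.environ:
            configs[key] = os.environ[key]
    return configs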
2.4 Initializing the Large Language Model
if configs["MODEL"] == "OpenAI":
    mllm = OpenAIModel(base_url=configs["OPENAI_API_BASE"],
                       api_key=configs["OPENAI_API_KEY"],
                       model=configs["OPENAI_API_MODEL"],
                       temperature=configs["TEMPERATURE"],
                       max_tokens=configs["MAX_TOKENS"])
elif configs["MODEL"] == "Qwen":
    mllm = QwenModel(api_key=configs["DASHSCOPE_API_KEY"],
                     model=configs["QWEN_MODEL"])
else:
    print_with_color(f"ERROR: Unsupported model type {configs['MODEL']}!", "red")
    sys.exit()
- Selects the OpenAI or Qwen model according to the configuration; any other value prints an error and exits.
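Both wrappers are used through the same call, mllm.get_model_response(prompt, images), which returns a (status, response) pair, as seen in the loop below. A sketch of the interface they are assumed to share (the class name BaseModel is illustrative):

from abc import ABC, abstractmethod
from typing import Any, List, Tuple

class BaseModel(ABC):
    # Interface assumed by the exploration loop: both OpenAIModel and QwenModel are called this way
    @abstractmethod
    def get_model_response(self, prompt: str, images: List[str]) -> Tuple[bool, Any]:
        """Send a text prompt plus image paths; return (success flag, response text or error object)."""
        ...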
2.5 Setting Up the Working Directories
app = args["app"]
root_dir = args["root_dir"]

if not app:
    print_with_color("What is the name of the target app?", "blue")
    app = input()
    app = app.replace(" ", "")

work_dir = os.path.join(root_dir, "apps")
if not os.path.exists(work_dir):
    os.mkdir(work_dir)
work_dir = os.path.join(work_dir, app)
if not os.path.exists(work_dir):
    os.mkdir(work_dir)
demo_dir = os.path.join(work_dir, "demos")
if not os.path.exists(demo_dir):
    os.mkdir(demo_dir)
demo_timestamp = int(time.time())
task_name = datetime.datetime.fromtimestamp(demo_timestamp).strftime("self_explore_%Y-%m-%d_%H-%M-%S")
task_dir = os.path.join(demo_dir, task_name)
os.mkdir(task_dir)
docs_dir = os.path.join(work_dir, "auto_docs")
if not os.path.exists(docs_dir):
    os.mkdir(docs_dir)
explore_log_path = os.path.join(task_dir, f"log_explore_{task_name}.txt")
reflect_log_path = os.path.join(task_dir, f"log_reflect_{task_name}.txt")
- Creates the app's working directory plus the demos, task, and auto_docs directories, and builds the paths of the exploration and reflection logs.
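As a side note, the repeated "check, then os.mkdir" pattern above can be collapsed with os.makedirs(..., exist_ok=True), which also creates missing parent directories; a small sketch (the helper name ensure_dirs is illustrative):

import os

def ensure_dirs(*dirs):
    # For practical purposes the same effect as the if-not-exists/mkdir chains above
    for d in dirs:
        os.makedirs(d, exist_ok=True)

# ensure_dirs(work_dir, demo_dir, task_dir, docs_dir)  # parents such as "apps" are created automatically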
2.6 Connecting to an Android Device
device_list = list_all_devices()
if not device_list:
    print_with_color("ERROR: No device found!", "red")
    sys.exit()
print_with_color(f"List of devices attached:\n{str(device_list)}", "yellow")
if len(device_list) == 1:
    device = device_list[0]
    print_with_color(f"Device selected: {device}", "yellow")
else:
    print_with_color("Please choose the Android device to start demo by entering its ID:", "blue")
    device = input()
controller = AndroidController(device)
width, height = controller.get_device_size()
if not width and not height:
    print_with_color("ERROR: Invalid device size!", "red")
    sys.exit()
print_with_color(f"Screen resolution of {device}: {width}x{height}", "yellow")
- Lists all connected Android devices and selects one (automatically if only one is attached).
- Queries the device's screen resolution; a sketch of how these helpers can be built on top of adb follows below.
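The controller and its helpers are defined in and_controller.py and are not shown in this file. A rough sketch of how list_all_devices and the device-size query could wrap adb (simplified to plain functions; the repo's AndroidController may differ in details):

import subprocess

def execute_adb(adb_command):
    # Run an adb command and return its stdout, or "ERROR" on failure
    result = subprocess.run(adb_command, capture_output=True, text=True, shell=True)
    if result.returncode == 0:
        return result.stdout.strip()
    return "ERROR"

def list_all_devices():
    # "adb devices" prints a header line followed by "<serial>\tdevice" entries
    output = execute_adb("adb devices")
    if output == "ERROR":
        return []
    devices = []
    for line in output.splitlines()[1:]:
        parts = line.split()
        if len(parts) == 2 and parts[1] == "device":
            devices.append(parts[0])
    return devices

def get_device_size(device):
    # "adb -s <serial> shell wm size" prints e.g. "Physical size: 1220x2712"
    output = execute_adb(f"adb -s {device} shell wm size")
    if output == "ERROR":
        return 0, 0
    width, height = output.splitlines()[0].split(": ")[1].split("x")
    return int(width), int(height)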
2.7 Entering the Task Description
print_with_color("Please enter the description of the task you want me to complete in a few sentences:", "blue")
task_desc = input()
- The user enters a task description that guides the exploration.
2.8 The Autonomous Exploration Loop
round_count = 0
doc_count = 0
useless_list = set()
last_act = "None"
task_complete = False

while round_count < configs["MAX_ROUNDS"]:
    round_count += 1
    print_with_color(f"Round {round_count}", "yellow")
    screenshot_before = controller.get_screenshot(f"{round_count}_before", task_dir)
    xml_path = controller.get_xml(f"{round_count}", task_dir)
    if screenshot_before == "ERROR" or xml_path == "ERROR":
        break
    clickable_list = []
    focusable_list = []
    traverse_tree(xml_path, clickable_list, "clickable", True)
    traverse_tree(xml_path, focusable_list, "focusable", True)
    elem_list = []
    for elem in clickable_list:
        if elem.uid in useless_list:
            continue
        elem_list.append(elem)
    for elem in focusable_list:
        if elem.uid in useless_list:
            continue
        bbox = elem.bbox
        center = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
        close = False
        for e in clickable_list:
            bbox = e.bbox
            center_ = (bbox[0][0] + bbox[1][0]) // 2, (bbox[0][1] + bbox[1][1]) // 2
            dist = (abs(center[0] - center_[0]) ** 2 + abs(center[1] - center_[1]) ** 2) ** 0.5
            if dist <= configs["MIN_DIST"]:
                close = True
                break
        if not close:
            elem_list.append(elem)
    draw_bbox_multi(screenshot_before, os.path.join(task_dir, f"{round_count}_before_labeled.png"), elem_list,
                    dark_mode=configs["DARK_MODE"])

    prompt = re.sub(r"<task_description>", task_desc, prompts.self_explore_task_template)
    prompt = re.sub(r"<last_act>", last_act, prompt)
    base64_img_before = os.path.join(task_dir, f"{round_count}_before_labeled.png")
    print_with_color("Thinking about what to do in the next step...", "yellow")
    status, rsp = mllm.get_model_response(prompt, [base64_img_before])

    if status:
        with open(explore_log_path, "a") as logfile:
            log_item = {"step": round_count, "prompt": prompt, "image": f"{round_count}_before_labeled.png",
                        "response": rsp}
            logfile.write(json.dumps(log_item) + "\n")
        res = parse_explore_rsp(rsp)
        act_name = res[0]
        last_act = res[-1]
        res = res[:-1]
        if act_name == "FINISH":
            task_complete = True
            break
        if act_name == "tap":
            _, area = res
            tl, br = elem_list[area - 1].bbox
            x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
            ret = controller.tap(x, y)
            if ret == "ERROR":
                print_with_color("ERROR: tap execution failed", "red")
                break
        elif act_name == "text":
            _, input_str = res
            ret = controller.text(input_str)
            if ret == "ERROR":
                print_with_color("ERROR: text execution failed", "red")
                break
        elif act_name == "long_press":
            _, area = res
            tl, br = elem_list[area - 1].bbox
            x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
            ret = controller.long_press(x, y)
            if ret == "ERROR":
                print_with_color("ERROR: long press execution failed", "red")
                break
        elif act_name == "swipe":
            _, area, swipe_dir, dist = res
            tl, br = elem_list[area - 1].bbox
            x, y = (tl[0] + br[0]) // 2, (tl[1] + br[1]) // 2
            ret = controller.swipe(x, y, swipe_dir, dist)
            if ret == "ERROR":
                print_with_color("ERROR: swipe execution failed", "red")
                break
        else:
            break
        time.sleep(configs["REQUEST_INTERVAL"])
    else:
        print_with_color(rsp, "red")
        break

    screenshot_after = controller.get_screenshot(f"{round_count}_after", task_dir)
    if screenshot_after == "ERROR":
        break
    draw_bbox_multi(screenshot_after, os.path.join(task_dir, f"{round_count}_after_labeled.png"), elem_list,
                    dark_mode=configs["DARK_MODE"])
    base64_img_after = os.path.join(task_dir, f"{round_count}_after_labeled.png")

    if act_name == "tap":
        prompt = re.sub(r"<action>", "tapping", prompts.self_explore_reflect_template)
    elif act_name == "text":
        continue
    elif act_name == "long_press":
        prompt = re.sub(r"<action>", "long pressing", prompts.self_explore_reflect_template)
    elif act_name == "swipe":
        swipe_dir = res[2]
        if swipe_dir == "up" or swipe_dir == "down":
            act_name = "v_swipe"
        elif swipe_dir == "left" or swipe_dir == "right":
            act_name = "h_swipe"
        prompt = re.sub(r"<action>", "swiping", prompts.self_explore_reflect_template)
    else:
        print_with_color("ERROR: Undefined act!", "red")
        break
    prompt = re.sub(r"<ui_element>", str(area), prompt)
    prompt = re.sub(r"<task_desc>", task_desc, prompt)
    prompt = re.sub(r"<last_act>", last_act, prompt)
    print_with_color("Reflecting on my previous action...", "yellow")
    status, rsp = mllm.get_model_response(prompt, [base64_img_before, base64_img_after])

    if status:
        resource_id = elem_list[int(area) - 1].uid
        with open(reflect_log_path, "a") as logfile:
            log_item = {"step": round_count, "prompt": prompt, "image_before": f"{round_count}_before_labeled.png",
                        "image_after": f"{round_count}_after.png", "response": rsp}
            logfile.write(json.dumps(log_item) + "\n")
        res = parse_reflect_rsp(rsp)
        decision = res[0]
        if decision == "ERROR":
            break
        if decision == "INEFFECTIVE":
            useless_list.add(resource_id)
            last_act = "None"
        elif decision == "BACK" or decision == "CONTINUE" or decision == "SUCCESS":
            if decision == "BACK" or decision == "CONTINUE":
                useless_list.add(resource_id)
                last_act = "None"
                if decision == "BACK":
                    ret = controller.back()
                    if ret == "ERROR":
                        print_with_color("ERROR: back execution failed", "red")
                        break
            doc = res[-1]
            doc_name = resource_id + ".txt"
            doc_path = os.path.join(docs_dir, doc_name)
            if os.path.exists(doc_path):
                doc_content = ast.literal_eval(open(doc_path).read())
                if doc_content[act_name]:
                    print_with_color(f"Documentation for the element {resource_id} already exists.", "yellow")
                    continue
            else:
                doc_content = {
                    "tap": "",
                    "text": "",
                    "v_swipe": "",
                    "h_swipe": "",
                    "long_press": ""
                }
            doc_content[act_name] = doc
            with open(doc_path, "w") as outfile:
                outfile.write(str(doc_content))
            doc_count += 1
            print_with_color(f"Documentation generated and saved to {doc_path}", "yellow")
        else:
            print_with_color(f"ERROR: Undefined decision! {decision}", "red")
            break
    else:
        print_with_color(rsp["error"]["message"], "red")
        break
    time.sleep(configs["REQUEST_INTERVAL"])
- Exploration loop: in each round, the tool performs the following steps:
- Capture a screenshot of the current screen and an XML dump of the UI hierarchy.
- Parse the XML to obtain the lists of clickable and focusable elements (a sketch of this parsing step follows the list).
- Ask the large language model for the next action.
- Execute the action (tap, swipe, text input, etc.).
- Capture a screenshot after the action.
- Ask the model to judge the effect of the action and generate documentation for the element.
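traverse_tree is imported from and_controller and is only called here. A rough sketch of the XML-parsing step it performs, assuming uiautomator-style dumps and a simplified element class (UIElement and the uid derivation below are illustrative; the repo's actual element class differs in details):

import xml.etree.ElementTree as ET
from dataclasses import dataclass
from typing import List, Tuple

@dataclass
class UIElement:
    uid: str                                       # identifier later used as the documentation file name
    bbox: Tuple[Tuple[int, int], Tuple[int, int]]  # ((x1, y1), (x2, y2)) screen bounds

def collect_elements(xml_path: str, elem_list: List[UIElement], attrib: str) -> None:
    # Collect nodes whose attribute (e.g. "clickable" or "focusable") equals "true"
    for _, node in ET.iterparse(xml_path, events=("start",)):
        if node.get(attrib) != "true":
            continue
        # bounds are encoded as "[x1,y1][x2,y2]"
        x1, y1, x2, y2 = map(int, node.get("bounds").replace("][", ",").strip("[]").split(","))
        uid = node.get("resource-id") or f"{node.get('class')}_{x1}_{y1}"
        elem_list.append(UIElement(uid.replace(":", ".").replace("/", "_"), ((x1, y1), (x2, y2))))

The bbox centers computed from these bounds are what controller.tap, long_press, and swipe receive as coordinates in the loop above.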
2.9 Ending the Exploration
if task_complete:
    print_with_color(f"Autonomous exploration completed successfully. {doc_count} docs generated.", "yellow")
elif round_count == configs["MAX_ROUNDS"]:
    print_with_color(f"Autonomous exploration finished due to reaching max rounds. {doc_count} docs generated.",
                     "yellow")
else:
    print_with_color(f"Autonomous exploration finished unexpectedly. {doc_count} docs generated.", "red")
- Prints a summary message that depends on how the exploration ended.
3. Example
(appagent) D:\mobile agent\AppAgent\scripts>python self_explorer.py --app weather
<class 'os._Environ'>
<class 'dict'>
Warning! No module named 'sounddevice'
Warning! No module named 'matplotlib'
Warning! No module named 'keras'
List of devices attached:
['YD5PMBIJZ5LBYLLB']
Device selected: YD5PMBIJZ5LBYLLB
Screen resolution of YD5PMBIJZ5LBYLLB: 1220x2712
Please enter the description of the task you want me to complete in a few sentences:
check the 15-day forecast
Round 1
Thinking about what to do in the next step...
Request cost is $0.02
Observation:
The screenshot shows a weather app interface displaying the current weather in Pengjiang with an overcast sky, temperature of 18°, and AQI of 49. There are also forecasts for today, tomorrow, and Saturday shown below. At the bottom, there is a button labeled "15-day forecast".
Thought:
To check the 15-day forecast, I need to tap the button labeled "15-day forecast".
Action:
tap(8)
Summary:
I tapped the "15-day forecast" button to view the 15-day weather forecast.
Reflecting on my previous action...
Request cost is $0.04
Decision:
SUCCESS
Thought:
The action successfully moved the task forward as it opened the 15-day forecast view, allowing the user to see the weather predictions for the next 15 days.
Documentation:
The "15-day forecast" button opens a detailed 15-day weather forecast screen.
Documentation generated and saved to ./apps\weather\auto_docs\com.miui.weather2.id_daily_forecast_com.miui.weather2.id_daily_forecast_more_6.txt
Round 2
Thinking about what to do in the next step...
Request cost is $0.02
Observation:
The screen displays a 15-day weather forecast with temperatures, weather icons, and wind forces for each day. The days shown are from Yesterday (12/25) to Sun (12/29). There are also navigation arrows at the top left and right.
Thought:
Since the 15-day forecast is already displayed, the task is completed.
Action:
FINISH
Summary:
I tapped the "15-day forecast" button to view the 15-day weather forecast and confirmed that the forecast is displayed.
Autonomous exploration completed successfully. 1 docs generated.
Generated directory structure: