当前位置：首页 > article >正文

Playwright 自动化测试与爬虫快速入门指南

article 2025/2/22 16:54:19

1. 环境配置

# Install the Playwright Python package
pip install playwright

# Install the browsers that Playwright drives
playwright install

2. 基础用法

2.1 基本结构

from playwright.sync_api import sync_playwright

def main():
    """Open https://example.com in a visible Chromium window, then close it.

    Demonstrates the minimal sync-API flow: start Playwright, launch a
    browser, create a context, open a page in that context, navigate, close.
    """
    with sync_playwright() as p:
        # Launch the browser; headless=False shows the browser window
        browser = p.chromium.launch(headless=False)
        context = browser.new_context()
        # Open the page in the context created above. browser.new_page()
        # would create a separate implicit context and leave this one unused.
        page = context.new_page()

        # Visit the page
        page.goto('https://example.com')

        # Close the browser
        browser.close()

if __name__ == '__main__':
    main()

2.2 元素定位方法

# 1. Use the Playwright Inspector (recommended)
# python -m playwright codegen https://example.com

# 2. Common selectors
page.click('text=按钮文字')           # text selector
page.click('role=button')             # role selector
page.click('.class-name')             # CSS selector
page.click('#id-name')                # ID selector
page.click('[data-testid=test-id]')   # test-ID attribute selector
page.click('xpath=//button')          # XPath selector

# 3. Chained selectors (parts joined with ">>")
page.click('.container >> text=点击这里')

2.3 基本操作

# Click an element
page.click('button')

# Type text into an input field
page.fill('input[name="username"]', '用户名')

# Wait until the .loading element is hidden
page.wait_for_selector('.loading', state='hidden')

# Read an element's text content
text = page.text_content('.content')

# Save a screenshot to screenshot.png
page.screenshot(path='screenshot.png')

3. 应对反爬虫的策略

3.1 基础配置

def create_stealth_browser(playwright=None):
    """Launch headless Chromium and a context configured to look less automated.

    Args:
        playwright: An already-started sync Playwright object (for example the
            value bound by ``with sync_playwright() as p``). When omitted, a
            new one is started here with ``sync_playwright().start()``.

    Returns:
        A ``(browser, context)`` tuple.

    Note:
        The Playwright driver has to outlive the returned objects. Launching
        inside ``with sync_playwright()`` and returning from within that block
        stops the driver on return, so the caller would receive a browser that
        is already closed. When this function starts Playwright itself the
        caller gets no handle to stop it; pass ``playwright`` in if you need
        to call ``playwright.stop()`` explicitly.
    """
    if playwright is None:
        # Deliberately not a `with` block: the driver must still be running
        # after this function returns.
        playwright = sync_playwright().start()

    browser = playwright.chromium.launch(
        headless=True,
        args=[
            # Turn off the Blink feature that flags the browser as automated
            '--disable-blink-features=AutomationControlled',
            '--disable-infobars',
            '--window-size=1920,1080',
            '--start-maximized'
        ]
    )

    context = browser.new_context(
        # Keep the viewport consistent with the --window-size flag above
        viewport={'width': 1920, 'height': 1080},
        user_agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36',
        java_script_enabled=True,
        ignore_https_errors=True
    )

    return browser, context

3.2 高级反爬策略

async def setup_stealth_page(context):
    """Open a page in ``context`` with basic anti-detection settings applied.

    Steps performed:
      * injects an init script that makes ``navigator.webdriver`` read as
        ``undefined``;
      * grants the ``geolocation`` permission and sets a fixed location
        (latitude 40.71, longitude -74.01) on the context;
      * sets the page's default timeout to a random value between
        30000 and 60000.

    Args:
        context: An async-API browser context.

    Returns:
        The newly created page.
    """
    # Imported locally so the snippet does not depend on a file-level import.
    import random

    page = await context.new_page()

    # Inject JavaScript that overrides a browser fingerprint property
    await page.add_init_script("""
        Object.defineProperty(navigator, 'webdriver', {
            get: () => undefined
        });
    """)

    # Set the geolocation. This is a context-level setting: the page object
    # has no set_geolocation() method.
    await context.grant_permissions(['geolocation'])
    await context.set_geolocation({"latitude": 40.71, "longitude": -74.01})

    # Randomize the default timeout (this is a timeout, not a delay).
    # set_default_timeout() is a plain method even in the async API, so it
    # must not be awaited.
    page.set_default_timeout(random.randint(30000, 60000))

    return page

# Random delay helper
async def random_sleep(min_seconds=2, max_seconds=5):
    """Pause the current coroutine for a random duration.

    Args:
        min_seconds: Lower bound of the delay, in seconds (default 2).
        max_seconds: Upper bound of the delay, in seconds (default 5).

    The delay is drawn with ``random.uniform(min_seconds, max_seconds)``.
    """
    # Imported locally so the helper works wherever it is pasted.
    import asyncio
    import random

    await asyncio.sleep(random.uniform(min_seconds, max_seconds))

3.3 代理设置

def create_proxy_context(playwright, server="http://proxy-server:port",
                         username="user", password="pass"):
    """Launch Chromium with its traffic routed through a proxy.

    Args:
        playwright: A started Playwright object exposing ``chromium.launch``.
        server: Proxy URL. The default is a placeholder and must be replaced
            with a real address.
        username: Proxy username; pass ``None`` to omit it.
        password: Proxy password; pass ``None`` to omit it.

    Returns:
        Whatever ``playwright.chromium.launch(proxy=...)`` returns. Note that
        despite the function name this is the launch result, not a context
        created with ``new_context()``.
    """
    proxy = {"server": server}
    # Only send credentials that were actually supplied.
    if username is not None:
        proxy["username"] = username
    if password is not None:
        proxy["password"] = password
    return playwright.chromium.launch(proxy=proxy)

3.4 Cookie 和 Session 管理

# Save the context's storage state (cookies) to auth.json
storage = context.storage_state(path="auth.json")

# Create a new context that reuses the saved state
context = browser.new_context(storage_state="auth.json")

4. 高级功能

4.1 请求拦截

def handle_route(route):
    """Abort requests whose resource type is "image"; let all others continue."""
    is_image = route.request.resource_type == "image"
    # Pick the route action first, then invoke it once.
    action = route.abort if is_image else route.continue_
    action()

# Register handle_route for every URL matching the "**/*" pattern
page.route("**/*", handle_route)

4.2 监听网络请求

def log_request(request):
    """Print a request's URL, HTTP method and headers, one per line."""
    report = (
        f"URL: {request.url}",
        f"Method: {request.method}",
        f"Headers: {request.headers}",
    )
    for line in report:
        print(line)

# Call log_request for each 'request' event emitted by the page
page.on('request', log_request)

4.3 异步模式

from playwright.async_api import async_playwright
import asyncio

async def main():
    """Async-API walkthrough: launch Chromium, open example.com, close the browser."""
    async with async_playwright() as playwright:
        chromium_browser = await playwright.chromium.launch()
        tab = await chromium_browser.new_page()
        await tab.goto('https://example.com')
        await chromium_browser.close()

# Run the coroutine on a fresh event loop
asyncio.run(main())

5. 实战示例

5.1 登录自动化

async def login(page, username, password):
    """Fill in and submit the login form at https://example.com/login.

    Args:
        page: An async-API page.
        username: Value typed into ``input[name="username"]``.
        password: Value typed into ``input[name="password"]``.

    The captcha step is unconditional: the function waits for
    ``.captcha-image`` and the wait fails on timeout if it never appears.
    ``solve_captcha`` is expected to be provided elsewhere and to return the
    captcha text for the given page.
    """
    await page.goto('https://example.com/login')
    await page.fill('input[name="username"]', username)
    await page.fill('input[name="password"]', password)

    # Wait for the captcha image to load
    await page.wait_for_selector('.captcha-image')

    # Solve the captcha (example) and type in the answer
    captcha = await solve_captcha(page)
    await page.fill('input[name="captcha"]', captcha)

    # The Python API has no page.wait_for_navigation(). Start listening for
    # the navigation before clicking so a fast navigation cannot be missed.
    async with page.expect_navigation():
        await page.click('button[type="submit"]')

5.2 数据采集

async def scrape_data(page):
    """Collect the text of every ``.item`` element on the page.

    Args:
        page: An async-API page that has already been navigated.

    Returns:
        A list of dicts, one per matched element, each with:
          * ``'title'``: the element's text content;
          * ``'timestamp'``: ``datetime.now()`` taken when the element was read.
        The list is empty when nothing matches ``.item``.
    """
    # Imported locally so the snippet does not depend on a file-level import.
    from datetime import datetime

    # Random delay before touching the page
    await random_sleep()

    # Read each matched element in document order
    elements = await page.query_selector_all('.item')
    return [
        {
            'title': await element.text_content(),
            'timestamp': datetime.now()
        }
        for element in elements
    ]

6. 最佳实践

错误处理

try:
    await page.click('button')
except TimeoutError:
    print("元素未找到")
except Exception as e:
    print(f"发生错误: {e}")

性能优化

# Abort requests for images and stylesheets (png, jpg, jpeg, gif, css)
await context.route('**/*.{png,jpg,jpeg,gif,css}', lambda route: route.abort())

# Set the default navigation timeout to 30000 (Playwright timeouts are in milliseconds)
page.set_default_navigation_timeout(30000)

定期清理资源

# Close every browser context, then the browser itself
async def cleanup():
    """Close each context of the module-level ``browser`` in turn, then close the browser."""
    for open_context in browser.contexts:
        await open_context.close()
    await browser.close()