# Python 爬虫:Playwright 的使用
# 1. 获取元素的文本方法:inner_text()
from playwright.sync_api import sync_playwright
# Demo: read the text of one element through Locator.inner_text().
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # Build the locator first, then ask it for its text.
    heading = page.locator("h1")
    text = heading.inner_text()
    print("获取的文本:", text)
    browser.close()
# 通过 inner_text() 方法获取页面中指定元素的文本内容。例如,获取 <h1> 标签的文本。
# 2. 获取所有匹配元素的文本方法:all_inner_texts()
from playwright.sync_api import sync_playwright
# Demo: collect the text of every element matched by a locator.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    paragraphs = page.locator("p")
    # all_inner_texts() is called on the locator for all <p> matches.
    texts = paragraphs.all_inner_texts()
    print("获取的所有文本:", texts)
    browser.close()
# 使用 all_inner_texts() 获取所有匹配元素的文本内容,例如获取页面中所有 <p> 标签文本。
# 3. 获取匹配元素的属性值方法:get_attribute()
from playwright.sync_api import sync_playwright
# Demo: read an attribute value from the element matched by a locator.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # NOTE(review): assumes the loaded page contains an <img> element - confirm.
    image = page.locator("img")
    img_src = image.get_attribute("src")
    print("图片的 src 属性:", img_src)
    browser.close()
#通过 get_attribute() 获取指定元素的属性值,例如获取 <img> 标签的 src 属性
# 4.获取 input 元素的 value 属性值方法:input_value()
from playwright.sync_api import sync_playwright
# Demo: read the current value of an input element.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # NOTE(review): assumes the page has an input with id "user" - confirm.
    user_field = page.locator("#user")
    value = user_field.input_value()
    print("输入框的值:", value)
    browser.close()
# 使用 input_value() 获取输入框的值,例如获取 <input id="user"> 的值。
# 5. 获取元素的位置方法:bounding_box()
from playwright.sync_api import sync_playwright
# Demo: query the bounding box of an element and print it.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # NOTE(review): assumes the page has an element with id "box11" - confirm.
    box = page.locator("#box11")
    rect = box.bounding_box()
    print("元素的位置:", rect)
    browser.close()
# 通过 bounding_box() 获取元素的位置和尺寸信息,例如获取 <div id="box11"> 的位置。
# 6. 统计匹配到的元素个数方法:count()
from playwright.sync_api import sync_playwright
# Demo: count how many elements a locator matches.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    list_items = page.locator("li")
    count = list_items.count()
    print("匹配的元素个数:", count)
    browser.close()
# 使用 count() 统计页面中匹配到的元素数量,例如统计所有 <li> 标签的数量。
# 在自动化测试和数据抓取中,等待机制是确保操作成功的关键。
# Playwright 提供了强大的等待功能,帮助开发者灵活应对各种场景。
# 等待机制:Playwright 默认提供了智能等待机制,默认等待(超时)时间为 30 秒(30000 毫秒)。你可以根据需要调整等待时间或使用特定的等待条件。
# 7. 修改默认的等待时间方法:set_default_timeout(timeout)
from playwright.sync_api import sync_playwright
# Demo: change a page's default timeout before using the page.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    # Set this page's default timeout to 5000 (milliseconds).
    page.set_default_timeout(5000)
    page.goto("https://example.com")
    browser.close()
# 通过 page.set_default_timeout() 方法设置该页面的默认等待(超时)时间,单位为毫秒。例如,将默认等待时间设置为 5000 毫秒。
# 8. 等待页面处于某种状态方法:wait_for_load_state(state)
from playwright.sync_api import sync_playwright
# Demo: block until the page reaches a given load state.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # Wait for the "load" state before reporting success.
    page.wait_for_load_state("load")
    print("页面加载完成!")
    browser.close()
"""
使用 wait_for_load_state() 等待页面加载状态达到指定条件。支持的状态包括:
'load':等待所有资源加载完成。
'domcontentloaded':等待 HTML 文档解析完成。
'networkidle':等待网络空闲(不推荐使用)。
"""
# 9. 等待页面跳转到指定的 URL方法:wait_for_url(url)
from playwright.sync_api import sync_playwright
# Demo: wait until the page URL matches a glob pattern.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # NOTE(review): nothing in this snippet triggers a navigation to
    # target.html, so the pattern looks like a placeholder - confirm and
    # replace with a real action plus the URL it leads to.
    page.wait_for_url("**/target.html")
    print("页面跳转完成!")
    browser.close()
# 通过 wait_for_url() 等待页面跳转到指定的 URL。可以使用通配符 ** 匹配路径。
# 10. 等待指定的时间方法:wait_for_timeout(timeout)
from playwright.sync_api import sync_playwright
# Demo: pause the script for a fixed amount of time.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # The argument is in milliseconds: 10000 ms.
    page.wait_for_timeout(10000)
    print("等待完成!")
    browser.close()
# 使用 wait_for_timeout() 暂停执行指定时间,单位为毫秒。例如,暂停 10000 毫秒。
# 11. 鼠标操作,鼠标点击,方法:page.mouse.click(x, y)
"""
通过 page.mouse.click() 方法模拟鼠标点击操作,指定点击的位置、按钮类型、点击次数和延迟时间。
参数:
x:X轴位置
y:Y轴位置
button:left、right、middle
click_count:点击次数,默认值为 1
delay:点击按下到释放的时间(以毫秒为单位)
"""
from playwright.sync_api import sync_playwright
# Demo: click at fixed page coordinates with the mouse API.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # One left-button click at (100, 200) with delay=100.
    page.mouse.click(
        100,
        200,
        button="left",
        click_count=1,
        delay=100,
    )
    print("鼠标点击完成!")
    browser.close()
# 12. 按下鼠标方法:page.mouse.down()
"""
参数:
x:X轴位置
y:Y轴位置
button:left、right、middle
通过 page.mouse.down() 方法模拟鼠标按下操作,通常需要先移动鼠标到指定位置。
"""
from playwright.sync_api import sync_playwright
# Demo: press a mouse button down after moving the pointer.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    mouse = page.mouse
    # Position the pointer first; the button is then pressed at that spot.
    mouse.move(100, 200)
    mouse.down(button="left")
    print("鼠标按下完成!")
    browser.close()
# 13. 移动鼠标方法:page.mouse.move()
"""
参数:
x:X轴位置
y:Y轴位置
通过 page.mouse.move() 方法移动鼠标到指定位置。
"""
from playwright.sync_api import sync_playwright
# Demo: move the mouse pointer to fixed coordinates.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    mouse = page.mouse
    mouse.move(100, 200)
    print("鼠标移动完成!")
    browser.close()
# 14. 释放鼠标方法:page.mouse.up()
"""
参数:
button:left、right、middle
click_count:点击次数,默认值为 1
通过 page.mouse.up() 方法释放鼠标按钮,通常与 page.mouse.down() 配合使用。
"""
from playwright.sync_api import sync_playwright
# Demo: press and then release a mouse button at a fixed position.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    mouse = page.mouse
    mouse.move(100, 200)
    # down() and up() are paired on the same button.
    mouse.down(button="left")
    mouse.up(button="left")
    print("鼠标释放完成!")
    browser.close()
# 15. 元素拖拽综合方法:page.drag_and_drop(source, target)
"""
参数:
source:拖拽元素的定位表达式
target:目标元素的定位表达式
通过 page.drag_and_drop() 方法实现元素的拖拽操作,指定拖拽的源元素和目标元素。
"""
from playwright.sync_api import sync_playwright
# Demo: drag one element onto another using XPath selectors.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    # NOTE(review): assumes the page has elements with ids "box22" and
    # "box11" - confirm.
    source_xpath = '//*[@id="box22"]'
    target_xpath = '//*[@id="box11"]'
    page.drag_and_drop(source_xpath, target_xpath)
    print("元素拖拽完成!")
    browser.close()
# 16. 鼠标滚轮滚动 方法:page.mouse.wheel(delta_x, delta_y)
"""
参数:
delta_x:要水平滚动的像素
delta_y:要垂直滚动的像素
通过 page.mouse.wheel() 方法模拟鼠标滚轮滚动,指定水平和垂直滚动的像素值。
"""
from playwright.sync_api import sync_playwright
# Demo: scroll the page with the mouse wheel.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://example.com")
    mouse = page.mouse
    # Horizontal delta 0, vertical delta 100 pixels (scrolls down).
    mouse.wheel(0, 100)
    print("鼠标滚动完成!")
    browser.close()
# 17 使用 Playwright 实现拖拽操作案例代码
"""
启动浏览器和页面:使用 launch() 启动浏览器,并通过 new_page() 创建一个新页面。
加载页面:使用 goto() 加载本地 HTML 文件。
等待元素加载:使用 wait_for_selector() 确保拖拽和目标元素已经加载完成。
执行拖拽操作:使用 drag_and_drop() 方法实现拖拽操作,指定拖拽的源元素和目标元素。
关闭浏览器:完成操作后关闭浏览器。Playwright 提供了强大的鼠标操作功能,尤其是 drag_and_drop 方法,可以轻松实现复杂的拖拽操作
"""
from playwright.sync_api import sync_playwright
def run_drag_and_drop(
    url="file:///path/to/your/drag_and_drop.html",
    source="#box22",
    target="#box11",
):
    """Open a page in Chromium and drag one element onto another.

    Calling the function with no arguments behaves like the original
    hard-coded example.

    Args:
        url: Address of the page to load. The default is a placeholder
            path and has to be replaced with a real local file.
        source: Selector of the element to drag.
        target: Selector of the element to drop onto.
    """
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False)
        try:
            page = browser.new_page()
            page.goto(url)
            # Make sure both elements are present before dragging.
            page.wait_for_selector(source)
            page.wait_for_selector(target)
            page.drag_and_drop(source, target)
            print("拖拽操作完成!")
        finally:
            # Close the browser even when one of the steps above raises.
            browser.close()
# Run the drag-and-drop demo when this script is executed.
run_drag_and_drop()
# 18 按下键盘按键方法:keyboard.down(key)
"""
通过 keyboard.down() 方法模拟按下指定按键,例如按下 Enter 键。
参数:
key:要按下的键的名称或要生成的字符
"""
from playwright.sync_api import sync_playwright
# Demo: hold a key down with the keyboard API.
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://www.toptal.com/developers/keycode")
    keyboard = page.keyboard
    # Fixed 2000 ms pauses before and after the key press.
    page.wait_for_timeout(2000)
    keyboard.down("Enter")
    page.wait_for_timeout(2000)
    browser.close()
# 19 按下并松开键盘按键方法:keyboard.press(key)(相当于先 keyboard.down(key) 再 keyboard.up(key))
"""
通过 keyboard.press() 方法模拟按下并松开指定按键,例如按下并松开 A 键。
参数:
key:要按下并松开的键的名称或要生成的字符
"""
from playwright.sync_api import sync_playwright
# Demo: press and release a key in one call with keyboard.press().
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://www.toptal.com/developers/keycode")
    keyboard = page.keyboard
    # Fixed 2000 ms pauses before and after the key press.
    page.wait_for_timeout(2000)
    keyboard.press("A")
    page.wait_for_timeout(2000)
    browser.close()
# 20 使用键盘进行文本输入方法:keyboard.type(text)
"""
通过 keyboard.type() 方法在焦点元素中输入文本
参数:
text:要输入到焦点元素中的文本
"""
from playwright.sync_api import sync_playwright
# Demo: type text into the focused element with keyboard.type().
with sync_playwright() as playwright:
    browser = playwright.chromium.launch(headless=False)
    page = browser.new_page()
    page.goto("https://www.baidu.com")
    page.wait_for_timeout(2000)
    # Focus the element with id "kw" so the typed text goes into it.
    search_box = page.locator("#kw")
    search_box.focus()
    page.wait_for_timeout(2000)
    page.keyboard.type("1234567890")
    page.wait_for_timeout(2000)
    browser.close()