当前位置：首页 > article >正文

[数据采集技术：实践02]：requests，lxml，BeautifulSoup模块的使用

article 2024/10/22 22:37:25

要求

1.创建项目文件夹，命名为：学号-2，如：20220001-2。后续所有文件均放在此文件夹中。

requests 模块和 lxml 模块中的 XPath的使用

2.新建 test1.py 文件，使用 requests 模块和 lxml 模块中的 XPath，
爬取豆瓣电影 Top250 中的电影信息。

test01

import requests
from lxml import etree

# Fetch the Douban Top 250 landing page and print the title, rating and
# one-line quote of every movie entry found on it.
url = 'https://movie.douban.com/top250'

# A browser-like User-Agent is sent instead of the requests default
# (presumably the site blocks obvious bots -- confirm if it matters).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}

# requests never times out by default; bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=10)

if response.status_code == 200:

    tree = etree.HTML(response.text)
    # Every movie entry is looked up via its <div class="info"> container.
    movies = tree.xpath('//div[@class="info"]')

    for movie in movies:

        # xpath() returns a (possibly empty) list; keep the raw lists so a
        # missing node falls back to a placeholder instead of IndexError.
        titles = movie.xpath('div[@class="hd"]/a/span[1]/text()')
        ratings = movie.xpath('.//span[@class="rating_num"]/text()')
        # The quote lives under <p class="quote">; the class-less <p>
        # queried previously is a different paragraph, not the quote.
        quotes = movie.xpath('.//p[@class="quote"]/span/text()')

        title = titles[0] if titles else "无"
        rating_num = ratings[0] if ratings else "无"
        quote = quotes[0] if quotes else "无"

        print(f"电影标题: {title}")
        print(f"评分: {rating_num}")
        print(f"引言: {quote}\n")

else:
    print("请求失败，状态码:", response.status_code)

BeautifulSoup 模块的使用

3.新建 test2.py 文件，使用 BeautifulSoup 模块获取 HTML 页面中的节点对应代码。

## test02
# test2.py
import requests
from bs4 import BeautifulSoup

# Download an HTML page and print the markup of one node located with
# BeautifulSoup.

# Example page URL (a static page is used here as a placeholder).
url = 'https://example.com'  # replace with the real HTML page URL

# Send the HTTP GET request with a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# requests never times out by default; bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=10)

# Only parse the body when the request succeeded.
if response.status_code == 200:
    # Parse the raw bytes so BeautifulSoup can detect the encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look up the target node (<div class="example"> is used as the example).
    node = soup.find('div', class_='example')

    if node is None:
        # find() returns None when nothing matches; say so explicitly
        # instead of printing the literal text "None" as if it were HTML.
        print("未找到匹配的节点")
    else:
        # Print the node's HTML source.
        print("节点的 HTML 代码:")
        print(str(node))
else:
    print("请求失败，状态码:", response.status_code)

4.新建 test3.py 文件，使用 BeautifulSoup 模块获取 HTML 页面中的节点的属性与文本内容。

## test03
# test3.py
import requests
from bs4 import BeautifulSoup

# Download an HTML page and print the attributes and text content of one
# node located with BeautifulSoup.

# Example page URL (a static page is used here as a placeholder).
url = 'https://example.com'  # replace with the real HTML page URL

# Send the HTTP GET request with a browser-like User-Agent.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36'
}
# requests never times out by default; bound the wait so a stalled
# connection cannot hang the script forever.
response = requests.get(url, headers=headers, timeout=10)

# Only parse the body when the request succeeded.
if response.status_code == 200:
    # Parse the raw bytes so BeautifulSoup can detect the encoding itself.
    soup = BeautifulSoup(response.content, 'html.parser')

    # Look up the target node (the first <a> tag is used as the example).
    node = soup.find('a')

    if node is None:
        # find() returns None when the page has no matching tag; without
        # this guard the attribute access below raises AttributeError.
        print("未找到匹配的节点")
    else:
        # Print every attribute of the node as "name: value".
        print("节点的属性:")
        for attr, value in node.attrs.items():
            print(f"{attr}: {value}")

        # Print the text contained in the node.
        print("\n节点的文本内容:")
        print(node.get_text())
else:
    print("请求失败，状态码:", response.status_code)