当前位置：首页 > article >正文

简单爬虫--框架

article 2025/3/17 20:04:31

简单爬虫

import requests
import re
import chardet

# Request headers that imitate a desktop browser; sent with every request
# made by this script.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Page whose <title> is fetched and printed (www.163.com, not Baidu).
url = "https://www.163.com"

# Seconds to wait for the server before giving up. Without a timeout
# requests.get() can block forever on an unresponsive host.
REQUEST_TIMEOUT = 10

# Matches the first <title> element:
#   * \b[^>]* tolerates attributes on the opening tag,
#   * DOTALL lets the title text span several lines,
#   * IGNORECASE accepts <TITLE> as well.
TITLE_PATTERN = re.compile(
    r"<title\b[^>]*>(.*?)</title>", re.IGNORECASE | re.DOTALL
)


def extract_title(html_content):
    """Return the text of the first ``<title>`` element in *html_content*.

    Args:
        html_content: HTML document as a string.

    Returns:
        The title text with leading/trailing whitespace removed, or
        ``None`` when the document contains no ``<title>`` element.
    """
    title_match = TITLE_PATTERN.search(html_content)
    if title_match is None:
        return None
    return title_match.group(1).strip()


def main():
    """Fetch ``url`` and print its page title, or a failure message."""
    try:
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as exc:
        # Connection errors, timeouts, TLS failures, etc. never produce a
        # status code, so report them separately instead of crashing.
        print(f"请求失败: {exc}")
        return

    # Only a 200 response is treated as success.
    if response.status_code != 200:
        print(f"请求失败，状态码: {response.status_code}")
        return

    # Detect the encoding from the raw bytes rather than trusting the HTTP
    # headers; done only after the status check so failed requests skip it.
    response.encoding = chardet.detect(response.content)["encoding"]

    title = extract_title(response.text)
    if title is not None:
        print(f"网页标题: {title}")
    else:
        print("未找到标题")


if __name__ == "__main__":
    main()