# 简单爬虫--框架 (Simple crawler: framework)
# 简单爬虫 (Simple crawler)
import requests
import re
import chardet
# Request headers that imitate a desktop browser.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Page whose <title> is fetched and printed.
url = "https://www.163.com"

# Seconds to wait for the server; without a timeout requests can block forever.
REQUEST_TIMEOUT = 10

# Matches the first <title> element. Attributes on the opening tag are
# tolerated, and DOTALL lets the captured text span several lines.
_TITLE_RE = re.compile(
    r"<title(?:\s[^>]*)?>(.*?)</title\s*>",
    re.IGNORECASE | re.DOTALL,
)

# Runs of ASCII whitespace, collapsed to one space inside the title text.
_ASCII_WHITESPACE_RE = re.compile(r"[ \t\n\f\r]+")


def extract_title(html_content):
    """Return the text of the first ``<title>`` element in *html_content*.

    Args:
        html_content: The decoded HTML document as a string.

    Returns:
        The title text with runs of ASCII whitespace collapsed to a single
        space and leading/trailing spaces removed (an empty string for an
        empty element), or ``None`` when no ``<title>`` element is found.
    """
    title_match = _TITLE_RE.search(html_content)
    if title_match is None:
        return None
    return _ASCII_WHITESPACE_RE.sub(" ", title_match.group(1)).strip(" ")


def main():
    """Fetch ``url`` and print its page title or a failure message.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the server does not answer within
            ``REQUEST_TIMEOUT`` seconds.
    """
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)

    # Only a 200 response is treated as success.
    if response.status_code != 200:
        print(f"请求失败,状态码: {response.status_code}")
        return

    # Decode the body with the encoding chardet detects from the raw bytes
    # rather than the one requests derived from the HTTP headers.
    response.encoding = chardet.detect(response.content)["encoding"]

    title = extract_title(response.text)
    if title is None:
        print("未找到标题")
    else:
        print(f"网页标题: {title}")


if __name__ == "__main__":
    main()