当前位置：首页 > article >正文

Python 爬虫技术指南

article 2025/3/1 0:00:57

Python 爬虫技术指南

一、基础工具

1. 请求库

# requests - 最常用的 HTTP 库
import requests

# Basic GET request.
# A timeout is passed explicitly: without one, requests waits indefinitely
# on a server that accepts the connection but never answers.
response = requests.get('https://api.example.com/data', timeout=10)
print(response.text)

# POST request; `json=` sends the dict as a JSON-encoded request body.
data = {'key': 'value'}
response = requests.post('https://api.example.com/post', json=data, timeout=10)

2. 解析库

# BeautifulSoup4 - HTML解析
from bs4 import BeautifulSoup

# Parse the HTML document (html_content must be supplied by the caller;
# it is not defined in this snippet).
soup = BeautifulSoup(html_content, 'html.parser')
# find() returns None when the document has no <title> element, so guard
# the attribute access instead of crashing with AttributeError.
title_tag = soup.find('title')
title = title_tag.text if title_tag is not None else None

# lxml - an efficient XML/HTML parser
from lxml import etree
# Parse html_content into an lxml tree (html_content is not defined in this
# snippet and must be provided by the surrounding code).
tree = etree.HTML(html_content)

二、高级技术

1. Selenium 自动化

from selenium import webdriver
from selenium.webdriver.common.by import By

# Start a Chrome browser session.
driver = webdriver.Chrome()

try:
    # Open the page.
    driver.get('https://example.com')

    # Locate the element whose id is "search" and type a query into it.
    element = driver.find_element(By.ID, 'search')
    element.send_keys('python')
finally:
    # Always shut the browser down, even if a step above fails; otherwise
    # the browser and driver processes are left running.
    driver.quit()

2. 异步爬虫

import aiohttp
import asyncio

async def fetch(url, session=None):
    """Download *url* and return the response body as text.

    Args:
        url: Address to request with HTTP GET.
        session: Optional already-open client session to reuse. Sharing one
            session across many calls reuses its connection pool; when
            omitted, a temporary ``aiohttp.ClientSession`` is created and
            closed for this single request (the original behaviour).

    Returns:
        The decoded response body.
    """
    if session is not None:
        # The caller owns the session, so it is deliberately not closed here.
        async with session.get(url) as response:
            return await response.text()

    async with aiohttp.ClientSession() as own_session:
        async with own_session.get(url) as response:
            return await response.text()

# 运行异步任务
async def main():
    """Fetch every URL concurrently and return the response bodies.

    Returns:
        A list with one entry per URL, in the same order as ``urls``.
        (Previously the gathered results were bound to a local and dropped.)
    """
    # Placeholder addresses; replace with real URLs before running.
    urls = ['url1', 'url2', 'url3']
    # gather() schedules all fetch coroutines at once and preserves the
    # input order in its result list.
    return await asyncio.gather(*(fetch(url) for url in urls))

三、反爬虫对策

1. 请求头处理

# Request headers sent with the request below. The User-Agent and Accept
# values are truncated placeholders ("...") and should be replaced with
# full header strings.
headers = {
    'User-Agent': 'Mozilla/5.0 ...',
    'Accept': 'text/html,application/xhtml+xml...',
    'Accept-Language': 'en-US,en;q=0.9',
    'Referer': 'https://example.com'
}

# `url` must be defined by the surrounding code. The timeout keeps the call
# from blocking forever on an unresponsive server.
response = requests.get(url, headers=headers, timeout=10)

2. IP代理池

# Map each URL scheme to the proxy that should carry it; both schemes use
# the same HTTP proxy endpoint here.
proxies = {
    'http': 'http://10.10.10.1:8000',
    'https': 'http://10.10.10.1:8000'
}

# `url` must be defined by the surrounding code. A timeout matters even more
# behind a proxy, where a dead proxy would otherwise stall the request.
response = requests.get(url, proxies=proxies, timeout=10)

3. Cookie管理

from http.cookiejar import CookieJar
import requests

# Create a session object; its cookie jar is updated below.
session = requests.Session()
# Placeholder cookie values to load into the session.
cookies = {'session_id': '123456'}
session.cookies.update(cookies)

四、数据存储

1. 文件存储

# CSV存储
import csv

# newline='' lets the csv module control line endings itself; the explicit
# encoding makes the output identical on every platform instead of depending
# on the locale's default encoding (important for non-ASCII scraped text).
with open('data.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # `data` is expected to be an iterable of rows (each row an iterable of
    # cell values).
    writer.writerows(data)

# JSON存储
import json

# Serialize `data` as JSON into data.json, overwriting any existing file.
with open('data.json', 'w') as f:
    json.dump(data, f)

2. 数据库存储

# SQLite示例
import sqlite3

# Connect to the SQLite database file database.db and obtain a cursor for
# issuing statements. NOTE(review): the snippet never closes `conn`; the
# surrounding code is presumably responsible for that.
conn = sqlite3.connect('database.db')
cursor = conn.cursor()

# MongoDB示例
from pymongo import MongoClient

# Create a client for the MongoDB server at localhost:27017 and select the
# database named 'database_name' (a placeholder name).
client = MongoClient('mongodb://localhost:27017/')
db = client['database_name']

五、并发处理

1. 多线程

from concurrent.futures import ThreadPoolExecutor
import threading

def crawl(url):
    """Placeholder for the per-URL crawling logic run by each worker thread.

    Currently does nothing with *url* and returns None.
    """
    return None

with ThreadPoolExecutor(max_workers=5) as executor:
    # executor.map() returns a lazy iterator: an exception raised inside
    # crawl() is only re-raised when its result is retrieved. Consuming the
    # iterator surfaces worker failures instead of silently dropping them,
    # and keeps the return values (in the order of `urls`).
    results = list(executor.map(crawl, urls))

2. 多进程

from multiprocessing import Pool

def crawl(url):
    """Stub for the crawling work executed in each worker process.

    Ignores *url* for now and returns None.
    """
    return None

if __name__ == '__main__':
    # Runs only when the file is executed as a script, not when imported.
    # Four worker processes apply crawl() to every entry of `urls`; map()
    # blocks until all of them have finished.
    with Pool(processes=4) as p:
        p.map(func=crawl, iterable=urls)

六、高级特性

1. 验证码处理

# 使用OCR识别
import pytesseract
from PIL import Image

def recognize_captcha(image_path):
    """Run OCR on a captcha image and return the recognized text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image,
        returned unmodified.
    """
    # Open the image in a context manager so the underlying file handle is
    # released as soon as OCR finishes, rather than being left open.
    with Image.open(image_path) as image:
        return pytesseract.image_to_string(image)

2. JavaScript渲染

# 使用Splash处理JavaScript
import requests

# Endpoint of a locally running Splash instance that returns rendered HTML.
splash_url = 'http://localhost:8050/render.html'
# `target_url` must be defined by the surrounding code; 'wait' is passed to
# Splash as a query parameter (presumably seconds to wait after page load —
# confirm against the Splash API docs).
params = {'url': target_url, 'wait': 2}
# The client-side timeout is generous because rendering takes longer than a
# plain fetch, but it still prevents an indefinite hang if Splash is down.
response = requests.get(splash_url, params=params, timeout=30)

七、最佳实践

1. 错误处理

def safe_request(url, retries=3, backoff=0.5):
    """GET *url*, retrying when the request fails at the network level.

    Args:
        url: Address to request.
        retries: Total number of attempts; must be at least 1.
        backoff: Base delay in seconds. After failed attempt number ``i``
            (0-based) the function sleeps ``backoff * 2 ** i`` before trying
            again, so retries do not hammer the server. Use 0 to retry
            immediately.

    Returns:
        The response of the first attempt that completes. The HTTP status
        code is not checked.

    Raises:
        ValueError: If ``retries`` is less than 1 (previously this silently
            returned None without making any request).
        requests.RequestException: Re-raised from the final failed attempt.
    """
    import time  # local import keeps this snippet self-contained

    if retries < 1:
        raise ValueError(f"retries must be >= 1, got {retries}")

    for i in range(retries):
        try:
            return requests.get(url, timeout=10)
        except requests.RequestException as e:
            print(f"Retry {i+1}, Error: {e}")
            if i == retries - 1:
                # Out of attempts: propagate the last error to the caller.
                raise
            time.sleep(backoff * 2 ** i)

2. 限速控制

import time
from ratelimit import limits, sleep_and_retry

@sleep_and_retry
@limits(calls=1, period=1)  # at most 1 call per 1-second period
def rate_limited_request(url):
    """GET *url* under the rate limit declared by the decorators above.

    Args:
        url: Address to request.

    Returns:
        The response object returned by ``requests.get``.
    """
    # Same timeout as safe_request, so a stalled server cannot block the
    # rate-limited pipeline indefinitely.
    return requests.get(url, timeout=10)

八、监控和日志

1. 日志记录

import logging

# Configure the root logger: records at INFO level and above are written to
# crawler.log as "<timestamp> - <level> - <message>".
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    filename='crawler.log'
)

# Emit a start-up record through the root logger.
logging.info('Starting crawler...')

2. 性能监控

import time
from memory_profiler import profile

@profile
def memory_intensive_crawl():
    """Placeholder for crawl logic whose memory usage is to be profiled.

    The body is empty for now, so the function returns None.
    """
    return None

九、项目结构

crawler/
├── config/
│   └── settings.py
├── spiders/
│   ├── __init__.py
│   └── spider.py
├── utils/
│   ├── proxy.py
│   └── parser.py
├── storage/
│   └── database.py
└── main.py

这个指南涵盖了Python爬虫开发的主要方面，从基础到高级特性。根据具体需求，可以选择合适的工具和技术组合使用。记住要遵守网站的robots.txt规则，合理控制爬取频率。

查看全文

http://www.kler.cn/a/446112.html