Complete Steps for Using Scrapy Together with Selenium
I. Environment Setup
1. Install Scrapy
Run the following command on the command line:
pip install scrapy
2. Install Selenium
pip install selenium
3. Install scrapy_selenium
pip install scrapy_selenium
4. Install chrome-headless-shell and chromedriver
Download both from the Google Chrome for Testing page, choosing a version that matches your environment; the commands below use 131.0.6778.204 as an example:
apt install -y libx11-dev libx264-dev libnss3 libgconf-2-4 libatk-bridge2.0-0 libatspi2.0-0 libcups2 libxcomposite1 libxrandr2 libayatana-appindicator3-1 libgbm1 libasound2 && \
cd /soft && \
wget https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.204/linux64/chromedriver-linux64.zip && \
unzip -j chromedriver-linux64.zip -d chromedriver && \
cp chromedriver/chromedriver /usr/local/bin/chromedriver && \
wget https://storage.googleapis.com/chrome-for-testing-public/131.0.6778.204/linux64/chrome-headless-shell-linux64.zip && \
unzip -j chrome-headless-shell-linux64.zip -d chrome_headless_shell && \
ln -s /soft/chrome_headless_shell/chrome-headless-shell /usr/local/bin/chrome
Check that chromedriver can run:
chromedriver --version
# If a version number is printed, the installation succeeded
#ChromeDriver 131.0.6778.204 (52183f9**************53d256f1516f2a0-refs/branch-heads/6***_1*5@{#7})
II. Setting Up the Scrapy Project
1. Create a Scrapy project
scrapy startproject product_collection
2. Create a spider from the basic template
Change into the spiders directory and generate the spider file (a sketch of the generated spider follows the commands):
cd product_collection/product_collection/spiders
scrapy genspider items <url>
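For reference, the generated items.py looks roughly like the following; the domain and URL are placeholders derived from the argument passed to genspider, and the parse method is where extraction logic goes:

import scrapy


class ItemsSpider(scrapy.Spider):
    name = "items"
    allowed_domains = ["example.com"]      # placeholder domain
    start_urls = ["https://example.com"]   # placeholder start URL

    def parse(self, response):
        # Extract data from the response here
        pass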
Run the spider:
scrapy crawl items
3. Create a crawl-template spider that follows links across multiple pages automatically; a minimal sketch appears after the commands below
cd product_collection/product_collection/spiders
scrapy genspider -t crawl items <url>
scrapy crawl items
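A crawl-template spider uses CrawlSpider rules to follow links on its own. The sketch below is a minimal example under the assumption of a hypothetical pagination URL pattern; adjust the LinkExtractor pattern and the selectors to the target site:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ItemsSpider(CrawlSpider):
    name = "items"
    allowed_domains = ["example.com"]           # placeholder domain
    start_urls = ["https://example.com/list"]   # placeholder listing page

    rules = (
        # Follow pagination links (placeholder pattern) and parse every matched page
        Rule(LinkExtractor(allow=r"page=\d+"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Placeholder selectors: adapt to the real page structure
        yield {
            "title": response.css("h1::text").get(),
            "url": response.url,
        }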
III. Handling the Version Incompatibility Between scrapy_selenium and Selenium
The latest scrapy_selenium release, 0.0.7, is not compatible with Selenium 4.x. With the default configuration it raises the following error:
TypeError: WebDriver.__init__() got an unexpected keyword argument 'executable_path'
Solutions (choose one)
1. Downgrade the installed Selenium version
Use Selenium 3.x together with a urllib3 version that is compatible with it:
pip uninstall selenium
pip install 'selenium<4'  # installs selenium 3.141.0 by default
pip uninstall urllib3
pip install urllib3==1.26.2
Note: newer urllib3 releases are not compatible with Selenium 3.x, which is why urllib3 has to be downgraded as well.
2. Patch the scrapy_selenium plugin source code
Clone the scrapy_selenium code from GitHub, modify it, and install your own build.
Source repository:
https://github.com/clemfromspace/scrapy-selenium
Fork the repository and clone it locally:
- First, fork the code into your own GitHub account:
https://github.com/clemfromspace/scrapy-selenium/fork
- Clone the fork to your machine and make the modifications:
git clone https://github.com/<your_username>/scrapy-selenium
Modify middlewares.py
In scrapy_selenium/middlewares.py, change the relevant code to:
"""This module contains the ``SeleniumMiddleware`` scrapy middleware"""
from scrapy import signals
from scrapy.exceptions import NotConfigured
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from .http import SeleniumRequest
class SeleniumMiddleware:
"""Scrapy middleware handling the requests using selenium"""
def __init__(self, driver_name, command_executor, driver_arguments):
# def __init__(self, driver_name, driver_executable_path,
# browser_executable_path, command_executor, driver_arguments):
"""Initialize the selenium webdriver
Parameters
----------
driver_name: str
The selenium ``WebDriver`` to use
driver_executable_path: str
The path of the executable binary of the driver
driver_arguments: list
A list of arguments to initialize the driver
browser_executable_path: str
The path of the executable binary of the browser
command_executor: str
Selenium remote server endpoint
"""
driver_name = driver_name.lower().capitalize()
driver_options = getattr(webdriver, f"{driver_name}Options")()
for argument in driver_arguments:
driver_options.add_argument(argument)
if command_executor:
self.driver = webdriver.Remote(command_executor=command_executor,
options=driver_options)
else:
driver_class = getattr(webdriver, driver_name)
self.driver = driver_class(options=driver_options)
# webdriver_base_path = f'selenium.webdriver.{driver_name}'
# driver_klass_module = import_module(f'{webdriver_base_path}.webdriver')
# driver_klass = getattr(driver_klass_module, 'WebDriver')
# driver_options_module = import_module(f'{webdriver_base_path}.options')
# driver_options_klass = getattr(driver_options_module, 'Options')
# driver_options = driver_options_klass()
# if browser_executable_path:
# driver_options.binary_location = browser_executable_path
# for argument in driver_arguments:
# driver_options.add_argument(argument)
# driver_kwargs = {
# 'executable_path': driver_executable_path,
# f'{driver_name}_options': driver_options
# }
# # locally installed driver
# if driver_executable_path is not None:
# driver_kwargs = {
# 'executable_path': driver_executable_path,
# f'{driver_name}_options': driver_options
# }
# self.driver = driver_klass(**driver_kwargs)
# # remote driver
# elif command_executor is not None:
# from selenium import webdriver
# capabilities = driver_options.to_capabilities()
# self.driver = webdriver.Remote(command_executor=command_executor,
# desired_capabilities=capabilities)
@classmethod
def from_crawler(cls, crawler):
"""Initialize the middleware with the crawler settings"""
driver_name = crawler.settings.get('SELENIUM_DRIVER_NAME')
# driver_executable_path = crawler.settings.get('SELENIUM_DRIVER_EXECUTABLE_PATH')
# browser_executable_path = crawler.settings.get('SELENIUM_BROWSER_EXECUTABLE_PATH')
command_executor = crawler.settings.get('SELENIUM_COMMAND_EXECUTOR')
driver_arguments = crawler.settings.get('SELENIUM_DRIVER_ARGUMENTS')
if driver_name is None:
raise NotConfigured('SELENIUM_DRIVER_NAME must be set')
# if driver_executable_path is None and command_executor is None:
# raise NotConfigured('Either SELENIUM_DRIVER_EXECUTABLE_PATH '
# 'or SELENIUM_COMMAND_EXECUTOR must be set')
middleware = cls(
driver_name=driver_name,
# driver_executable_path=driver_executable_path,
# browser_executable_path=browser_executable_path,
command_executor=command_executor,
driver_arguments=driver_arguments
)
crawler.signals.connect(middleware.spider_closed, signals.spider_closed)
return middleware
def process_request(self, request, spider):
"""Process a request using the selenium driver if applicable"""
if not isinstance(request, SeleniumRequest):
return None
self.driver.get(request.url)
for cookie_name, cookie_value in request.cookies.items():
self.driver.add_cookie(
{
'name': cookie_name,
'value': cookie_value
}
)
if request.wait_until:
WebDriverWait(self.driver, request.wait_time).until(
request.wait_until
)
if request.screenshot:
request.meta['screenshot'] = self.driver.get_screenshot_as_png()
if request.script:
self.driver.execute_script(request.script)
body = str.encode(self.driver.page_source)
# Expose the driver via the "meta" attribute
request.meta.update({'driver': self.driver})
return HtmlResponse(
self.driver.current_url,
body=body,
encoding='utf-8',
request=request
)
def spider_closed(self):
"""Shutdown the driver when spider is closed"""
self.driver.quit()
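With this patched middleware, the driver is configured entirely through settings: when SELENIUM_COMMAND_EXECUTOR is set, requests are sent to a remote Selenium server via webdriver.Remote; otherwise a local driver is created from the options. A minimal settings sketch (the Grid URL below is only an illustrative assumption):

# settings.py for the patched middleware
SELENIUM_DRIVER_NAME = 'chrome'
SELENIUM_DRIVER_ARGUMENTS = ['--headless', '--no-sandbox']

# Optional: set only when driving a remote Selenium server / Grid;
# leave it unset to start a local driver instead (Selenium then locates chromedriver itself).
# SELENIUM_COMMAND_EXECUTOR = 'http://localhost:4444/wd/hub'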
Modify setup.py
"""This module contains the packaging routine for the pybook package"""
from setuptools import setup, find_packages
try:
from pip._internal.network.session import PipSession
from pip._internal.req import parse_requirements
except ImportError:
# It is quick hack to support pip 10 that has changed its internal
# structure of the modules.
from pip._internal.network.session import PipSession
from pip._internal.req.req_file import parse_requirements
def get_requirements(source):
"""Get the requirements from the given ``source``
Parameters
----------
source: str
The filename containing the requirements
"""
install_reqs = parse_requirements(filename=source, session=PipSession())
return [str(ir.requirement) for ir in install_reqs]
setup(
packages=find_packages(),
install_requires=get_requirements('requirements/requirements.txt')
)
After making the changes, reinstall the package from your fork:
pip uninstall scrapy-selenium
pip install git+https://github.com/<your_username>/scrapy-selenium
IV. Scrapy and Selenium Configuration
Add the following settings to the Scrapy project's settings.py:
# Selenium-related settings
# Use the Chrome browser
SELENIUM_DRIVER_NAME = 'chrome'
# Path to the Chrome browser binary
SELENIUM_BROWSER_EXECUTABLE_PATH = '/path/to/chrome'
# Run the browser in headless mode
SELENIUM_DRIVER_ARGUMENTS = [
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    #'--start-maximized',
    '--window-size=1920,1080'
]
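These settings only describe the browser; if you go through the scrapy_selenium middleware, it also has to be enabled in DOWNLOADER_MIDDLEWARES and requests have to be issued as SeleniumRequest, following the plugin's README. A short sketch (the URL is a placeholder):

# settings.py
DOWNLOADER_MIDDLEWARES = {
    'scrapy_selenium.SeleniumMiddleware': 800,
}

And in the spider:

import scrapy
from scrapy_selenium import SeleniumRequest


class ItemsSpider(scrapy.Spider):
    name = 'items'

    def start_requests(self):
        # example.com is a placeholder URL
        yield SeleniumRequest(url='https://example.com', callback=self.parse)

    def parse(self, response):
        # The patched middleware also exposes the driver via response.request.meta['driver']
        pass

Note that the unpatched 0.0.7 release (the downgraded-Selenium route) additionally requires SELENIUM_DRIVER_EXECUTABLE_PATH to point at chromedriver.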
V. Implementing Selenium Support with a Custom Middleware
If you prefer not to use the scrapy_selenium plugin, you can write your own downloader middleware to add Selenium support.
1. Create the SeleniumMiddleware
Add the following code to the middlewares.py file in the project directory:
from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By


class SeleniumMiddleware:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # run in headless mode
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--start-maximized")
        # Explicitly point to the Chrome (headless shell) binary
        chrome_options.binary_location = "/path/to/chrome-headless-shell"
        self.driver = webdriver.Chrome(service=Service('/path/to/chromedriver'),
                                       options=chrome_options)

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        if 'selenium' in request.meta:
            # Open the requested URL with Selenium
            self.driver.get(request.url)
            # Explicitly wait until the given element is present
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.XPATH, '<XPath expression>'))
            )
            # Scroll to the bottom of the page
            self.toBottom(self.driver)
            body = self.driver.page_source
            response_url = self.driver.current_url
            # The driver is kept open so it can be reused by later requests;
            # it is shut down once in spider_closed.
            return HtmlResponse(
                url=response_url,
                body=body,
                encoding='utf-8',
                request=request
            )

    def spider_closed(self, spider):
        # Quit the Chrome instance
        self.driver.quit()

    def toBottom(self, driver):
        driver.execute_script("document.documentElement.scrollTop = 100000")
2. Enable the middleware
Enable the custom middleware in settings.py:
DOWNLOADER_MIDDLEWARES = {
'product_collection.middlewares.SeleniumMiddleware': 543,
}
3. Use it in a spider
Set the meta attribute on the spider's requests:
yield scrapy.Request(url=<url>, meta={'selenium': True})
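Put together, a spider that routes its requests through the custom middleware could look like the sketch below; the URL and selectors are placeholders:

import scrapy


class ItemsSpider(scrapy.Spider):
    name = "items"

    def start_requests(self):
        # meta={'selenium': True} tells SeleniumMiddleware to render this request
        yield scrapy.Request(url="https://example.com/products",
                             meta={'selenium': True},
                             callback=self.parse)

    def parse(self, response):
        # The response body is the Selenium-rendered page source
        for title in response.css("h2::text").getall():
            yield {"title": title}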
VI. Notes
- Make sure the versions of Scrapy, Selenium, and the patched source are compatible with one another.
- Selenium needs a driver that matches the installed browser; if driver problems come up, webdriver-manager can manage the driver automatically (see the snippet after this list).
- To balance performance, consider letting Scrapy fetch static pages and using Selenium only for dynamically rendered content.
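For example, a minimal webdriver-manager sketch (after pip install webdriver-manager) that avoids hard-coding the chromedriver path:

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

# webdriver-manager downloads a chromedriver that matches the installed Chrome
# and returns its local path, so no fixed driver path is needed.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))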
By following the steps above, you can combine Scrapy and Selenium successfully and take full advantage of the strengths of both.