当前位置：首页 > article >正文

利用PHP和phpSpider进行图片爬取及下载

article 2025/3/11 15:00:14

利用PHP和phpSpider进行图片爬取及下载，可以遵循以下步骤。phpSpider是一个开源的PHP爬虫框架，它可以帮助你轻松地抓取网页内容。以下是一个基本的步骤指南：

1. 安装phpSpider

首先，你需要确保你已经安装了Composer（PHP的依赖管理工具），然后使用Composer来安装phpSpider。

composer require phpspider/phpspider

2. 创建爬虫项目

创建一个新的PHP文件，例如 image_crawler.php，并在其中引入phpSpider的依赖。

<?php
require 'vendor/autoload.php';

use phpspider\core\Spider;
use GuzzleHttp\Client;

class ImageSpider extends Spider {
    // 初始化爬虫
    public function __construct($url) {
        $this->setUrlRules([$url], "allow");  // 设置允许的URL规则
        $this->addFields([  // 定义需要抓取的字段
            'image_urls' => ['css' => 'img[src]', 'method' => 'src'],
        ]);
        $this->setContentLimit(10 * 1024 * 1024);  // 设置内容抓取限制，例如10MB
        $this->setThreadCount(5);  // 设置抓取线程数
    }

    // 抓取完成后的回调函数
    public function parse($content, $url) {
        $imageUrls = $content->getFields('image_urls');
        foreach ($imageUrls as $imageUrl) {
            $this->downloadImage($imageUrl);
        }
    }

    // 下载图片
    private function downloadImage($imageUrl) {
        $client = new Client();
        $response = $client->get($imageUrl);

        if ($response->getStatusCode() === 200) {
            $body = $response->getBody();
            $imageContent = $body->getContents();
            $imageName = basename($imageUrl);
            $filePath = 'downloads/' . $imageName;
            
            file_put_contents($filePath, $imageContent);
            echo "Downloaded: $filePath\n";
        } else {
            echo "Failed to download: $imageUrl\n";
        }
    }
}

// 实例化并启动爬虫
$spider = new ImageSpider('https://example.com');  // 替换为你要抓取的网页URL
$spider->start();