当前位置: 首页 > article >正文

Python Selenium 图片资源自动搜索保存 项目实践

实现访问首页

from os.path import dirname

from selenium import webdriver


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""
    def __init__(self):
        """初始化"""
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        print("=======结束=======")


if __name__ == '__main__':
    ImageAutoSearchAndSave().run()

启动后会自动打开一个页面
在这里插入图片描述

实现图片自动欧索

from os.path import dirname

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword):
        """初始化"""
        self.keyword = keyword
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self.driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self.keyword + Keys.ENTER)


if __name__ == '__main__':
    keyword = "cat"
    ImageAutoSearchAndSave(keyword).run()

遍历所有图片列表页面

页面分析

第一页
在这里插入图片描述
第二页
在这里插入图片描述
由此可得出变化的只有这里,根据pagi= 展示不同页面
在这里插入图片描述
红色箭头定位到页数,绿色的不要使用 是反爬虫的限制,不断变化的
在这里插入图片描述

from os.path import dirname

from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword):
        """初始化"""
        self.keyword = keyword
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        # 遍历所有页面
        self._iter_all_page()


        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self.driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self.keyword + Keys.ENTER)

    def _iter_all_page(self):
        '''遍历所有页面'''
        # 获取总页面
        elem = self.driver.find_element_by_css_selector("input[class^=pageInput]")
        page_total = int(elem.text.strip("/ "))
        # 遍历所有页面
        base_url = self.driver.current_url
        for page_num in range(1,page_total+1):
            if page_num>1:
                self.driver.get(f"{base_url}?pagi={page_num}&")
if __name__ == '__main__':
    keyword = "sunflower"
    ImageAutoSearchAndSave(keyword).run()

获取所有图片详情页链接

在这里插入图片描述

from os.path import dirname

from lxml.etree import HTML
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword):
        """初始化"""
        self.keyword = keyword
        self.all_detail_link = []
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        # 遍历所有页面
        self._iter_all_page()

        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self.driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self.keyword + Keys.ENTER)

    def _iter_all_page(self):
        '''遍历所有页面'''
        # 获取总页面
        elem = self.driver.find_element_by_css_selector("input[class^=pageInput]")
        page_total = int(elem.text.strip("/ "))
        # 遍历所有页面
        base_url = self.driver.current_url
        for page_num in range(1, page_total + 1):
            if page_num > 1:
                self.driver.get(f"{base_url}?pagi={page_num}&")
            # href 属性
            root = HTML(self.driver.page_source)
            # a标签中的href属性
            detail_links = root.xpath("//div[start-with(@class,'result')]//a[start-with(@class,'link')]/@href")
            for detail_link in detail_links:
                self.all_detail_link.append(detail_link)


if __name__ == '__main__':
    keyword = "sunflower"
    ImageAutoSearchAndSave(keyword).run()

增加下载数量的限制

在这里插入图片描述

from os.path import dirname

from lxml.etree import HTML
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword,limit=0):
        """初始化"""
        self.keyword = keyword
        self.all_detail_link = []
        self.limit=limit #0表示没有限制
        self.count = 0 #用来计数
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        # 遍历所有页面
        self._iter_all_page()

        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self.driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self.keyword + Keys.ENTER)

    def _iter_all_page(self):
        '''遍历所有页面'''
        # 获取总页面
        elem = self.driver.find_element_by_css_selector("input[class^=pageInput]")
        # page_total = int(elem.text.strip("/ "))
        page_total = 5
        # 遍历所有页面
        base_url = self.driver.current_url
        for page_num in range(1, page_total + 1):
            if page_num > 1:
                self.driver.get(f"{base_url}?pagi={page_num}&")
            # href 属性
            root = HTML(self.driver.page_source)
            is_reach_limit = False
            # 获取每一页详情链接   a标签中的href属性
            detail_links = root.xpath("//div[start-with(@class,'result')]//a[start-with(@class,'link')]/@href")
            for detail_link in detail_links:
                self.all_detail_link.append(detail_link)
                self.count += 1
                if self.limit >0 and self.count == self.limit:
                    is_reach_limit = True
                    break
            if is_reach_limit:
                break
if __name__ == '__main__':
    keyword = "sunflower"
    limit = 3
    ImageAutoSearchAndSave(keyword,limit).run()

获取所有图片详情、获取图片下载链接

在这里插入图片描述

from os.path import dirname

from lxml.etree import HTML
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword, limit=0):
        """初始化"""
        self.keyword = keyword
        self.all_detail_link = []
        self.limit = limit  # 0表示没有限制
        self.count = 0  # 用来计数
        self.all_download_link = [] # 图片详情的下载链接
        self.driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self.driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        # 遍历所有页面
        self._iter_all_page()
        # 访问图片详情页
        self._visit_image_detail()
        # 释放资源
        self.driver.close()
        del self.all_detail_link
        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self.driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self.keyword + Keys.ENTER)

    def _iter_all_page(self):
        '''遍历所有页面'''
        # 获取总页面
        elem = self.driver.find_element_by_css_selector("input[class^=pageInput]")
        # page_total = int(elem.text.strip("/ "))
        page_total = 5
        # 遍历所有页面
        base_url = self.driver.current_url
        for page_num in range(1, page_total + 1):
            if page_num > 1:
                self.driver.get(f"{base_url}?pagi={page_num}&")
            # href 属性
            root = HTML(self.driver.page_source)
            is_reach_limit = False
            # 获取每一页详情链接   a标签中的href属性
            detail_links = root.xpath("//div[start-with(@class,'result')]//a[start-with(@class,'link')]/@href")
            for detail_link in detail_links:
                self.all_detail_link.append(detail_link)
                self.count += 1
                if self.limit > 0 and self.count == self.limit:
                    is_reach_limit = True
                    break
            if is_reach_limit:
                break

    def _visit_image_detail(self):
        '''访问图片详情页'''
        for detail_link in self.all_detail_link:
            self.driver.get(detail_link)
            elem = self.driver.find_element_by_css_selector("#media_container > picture > img")
            download_link = elem.get_attribute("src")
            self.all_download_link.append(download_link)

if __name__ == '__main__':
    keyword = "sunflower"
    limit = 3
    ImageAutoSearchAndSave(keyword, limit).run()

下载所有图片

from io import BytesIO
from os import makedirs
from os.path import dirname
from time import strftime

import requests
from PIL import Image
from lxml.etree import HTML
from selenium import webdriver
from selenium.webdriver.common.keys import Keys


class ImageAutoSearchAndSave:
    """图片自动搜索保存"""

    def __init__(self, keyword, limit=0):
        """初始化"""
        self._keyword = keyword
        self.all_detail_link = []
        self._limit = limit  # 0表示没有限制
        self._count = 0  # 用来计数
        self.all_download_link = [] # 图片详情的下载链接
        self._driver = webdriver.Chrome(executable_path=dirname(__file__) + "/chromedriver.exe")

    def run(self):
        """开始运行"""
        print("=======开始=======")
        # 访问首页
        self._driver.get("https://pixabay.com/")
        # 搜索图片
        self._search_image()
        # 遍历所有页面
        self._iter_all_page()
        # 访问图片详情页
        self._visit_image_detail()
        # 释放资源
        self._driver.close()
        del self.all_detail_link
        
        # 下载所有图片
        self._download_all_image()
        print("=======结束=======")

    def _search_image(self):
        '''搜索图片'''
        elem = self._driver.find_element_by_css_selector("input[name]")
        elem.send_keys(self._keyword + Keys.ENTER)

    def _iter_all_page(self):
        '''遍历所有页面'''
        # 获取总页面
        elem = self._driver.find_element_by_css_selector("input[class^=pageInput]")
        # page_total = int(elem.text.strip("/ "))
        page_total = 5
        # 遍历所有页面
        base_url = self._driver.current_url
        for page_num in range(1, page_total + 1):
            if page_num > 1:
                self._driver.get(f"{base_url}?pagi={page_num}&")
            # href 属性
            root = HTML(self._driver.page_source)
            is_reach_limit = False
            # 获取每一页详情链接   a标签中的href属性
            detail_links = root.xpath("//div[start-with(@class,'result')]//a[start-with(@class,'link')]/@href")
            for detail_link in detail_links:
                self.all_detail_link.append(detail_link)
                self._count += 1
                if self._limit > 0 and self._count == self._limit:
                    is_reach_limit = True
                    break
            if is_reach_limit:
                break

    def _visit_image_detail(self):
        '''访问图片详情页 获取对应的图片链接'''
        for detail_link in self.all_detail_link:
            self._driver.get(detail_link)
            elem = self._driver.find_element_by_css_selector("#media_container > picture > img")
            download_link = elem.get_attribute("src")
            self.all_download_link.append(download_link)

    def _download_all_image(self):
        '''下载所有图片'''
        download_dir = f"{dirname(__file__)}/download/{strftime('%Y%m%d-%H%M%S')}-{self._keyword}"
        makedirs(download_dir)
        #下载所有图片
        count = 0
        for download_link in self.all_download_link:
            response = requests.get(download_link)
            count += 1
            # response.content 二进制内容  response.text 文本内容
            image = Image.open(BytesIO(response.content))
            # rjust  000001.png
            filename = f"{str(count).rjust(6,'0')}.png"
            file_path = f"{download_dir}/{filename}"
            image.save(file_path)

if __name__ == '__main__':
    keyword = "sunflower"
    limit = 3
    ImageAutoSearchAndSave(keyword, limit).run()

可以适当的进行优化,使用selnium的页面加载策略

在这里插入图片描述


http://www.kler.cn/a/147471.html

相关文章:

  • 【网络协议】开放式最短路径优先协议OSPF详解(四)
  • springboot + vue+elementUI图片上传流程
  • 什么是 ES6 “模板语法” ?
  • xss-labs关卡记录15-20关
  • 深入理解 React 中 setState 的行为及状态更新时机
  • 智能工厂的设计软件 应用场景的一个例子: 为AI聊天工具添加一个知识系统 之24 重审 前端实现:主页页面
  • 【Java程序员面试专栏 专业技能篇】Java SE核心面试指引(三):核心机制策略
  • 使用物联网的家庭自动化
  • 开源和闭源软件对开发的影响
  • 振弦式轴力计和振弦采集仪组成的安全监测解决方案
  • Mysql数据库多表数据查询问题
  • Spring Boot配置文件 Spring日志文件相关的知识
  • 机器视觉:塑造未来的智能视界
  • AI超级个体:ChatGPT与AIGC实战指南
  • 马卡龙产业分析:全球市场规模约19.3亿美元
  • Unity 后期特效用到的一些方法:OnRenderImage Blit
  • illuminate/database 使用 四
  • 第71讲:MySQL锁机制详解:表级锁、元数据锁和意向锁的全面解析与实践指南
  • python实现自动刷平台学时
  • Vue 2.0源码分析-渲染函数render
  • 【密码学引论】分组密码
  • css之选择第一个或最后一个元素、第n个标签、选择偶数或奇数标签、选择最后n个标签、等差数列标签的选择、first、last、nth、child
  • 学习Qt的网站
  • git突然失效:无法提交的问题 无法推送到远程仓库
  • 为什么API管理工具对开发人员有益?
  • Vue中的深度监听Deep Watch