当前位置: 首页 > article >正文

初识爬虫3

1.cookies参数(浏览器的登录信息,需要设置,防止反爬机制检测)

        1.1 headers中设置cookies参数

# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
# 构建请求字典
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

response = requests.get(url, headers=headers)

with open('guitub_1.html', 'wb') as f:
    f.write(response.content)

        1.2 构建cookies字典

# -*- coding: utf-8 -*-
import requests

url = 'https://github.com/'
# 构建请求字典
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
    # 'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}

# 构建cookies字典
temp = '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'

# 方案一
# cookies_list = temp.split('; ')
# cookies = {}
# for cookie in cookies_list:
#     cookies[cookie.split('=')[0]] = cookie.split('=')[-1]
#
# response = requests.get(url, headers=headers, cookies=cookies)
#
#
# with open('guitub_2.html', 'wb') as f:
#     f.write(response.content)

# 方案二(工作中用)
cookies_list = temp.split('; ')
cookies = {cookie.split('=')[0] : cookie.split('=')[1] for cookie in cookies_list}
print(cookies)

response = requests.get(url, headers=headers, cookies=cookies)


# with open('guitub_3.html', 'wb') as f:
#     f.write(response.content)

2.cookiejar的处理(存储和管理 cookie) 

# -*- coding: utf-8 -*-
import requests

url = 'https://www.baidu.com'

response = requests.get(url)
print(response.cookies)

# 对cookiejar对象的处理,将cookiejar转换为字典:
dict_cookies = requests.utils.dict_from_cookiejar(response.cookies)
print(dict_cookies)

# 将字典转换回cookiejar
jar_cookies = requests.utils.cookiejar_from_dict(dict_cookies)
print(jar_cookies)

3. 模拟网络波动,timeout的使用

# -*- coding: utf-8 -*-
import requests

url = 'https://twitter.com'

response = requests.get(url, timeout=3)
print(response.cookies)


http://www.kler.cn/a/301860.html

相关文章:

  • Qwen2.5-Coder-32B-Instruct Docker 部署openai接口
  • 电脑长期不用,开不了机怎样解决
  • MySQL初学之旅(3)约束
  • CC6学习记录
  • 【route】route add命令详解
  • uniapp ios app以framwork形式接入sentry
  • 【区块链通用服务平台及组件】信息数据流转验真技术研究项目 | FISCO BCOS应用案例
  • HCIA--实验十一:单区域OSPF路由实验
  • 基于SpringBoot+Vue+MySQL的垃圾分类回收管理系统
  • 【菜菜的sklearn机器学习】(2)回归树
  • 设计模式 22 模板方法模式
  • electron 客户端 windows linux(麒麟V10)多系统离线打包 最新版 <一>
  • 【系统架构设计师】建造者模式(Builder Pattern)
  • Android通知——Notification
  • 【JUC并发编程系列】深入理解Java并发机制:从synchronized到CAS锁升级全过程(三、synchronized 前置知识)
  • SpringBoot教师招聘管理系统---附源码81097
  • ios调整启动图显示的时间
  • Java | Leetcode Java题解之第402题移掉K位数字
  • RabbitMQ 07 另两种集群方式 warren(主备模式),shovel(远程模式)
  • Java算法:二进制和位运算
  • redis的事务与管道有什么不同?
  • 闪存产品概述 NAND NOR FLASH
  • Redis——常用数据类型string
  • 【网络通信基础与实践第二讲】包括互联网概述、互联网发展的三个阶段、互联网的组成、计算机网络的体系结构
  • python 连接 oracle 报错
  • 2024 年 GitLab Global DevSecOps 报告解读