初识爬虫3
1.cookies参数(浏览器的登录信息,需要设置,防止反爬机制检测)
1.1 headers中设置cookies参数
# -*- coding: utf-8 -*-
import requests
url = 'https://github.com/'
# 构建请求字典
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}
response = requests.get(url, headers=headers)
with open('guitub_1.html', 'wb') as f:
f.write(response.content)
1.2 构建cookies字典
# -*- coding: utf-8 -*-
import requests
url = 'https://github.com/'
# 构建请求字典
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
# 'Cookie': '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
}
# 构建cookies字典
temp = '_octo=GH1.1.186456086.1726021874; preferred_color_mode=light; tz=Asia%2FShanghai; _device_id=30702b425eb631902645d34d468c44f3; saved_user_sessions=169171441%3A9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; user_session=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; __Host-user_session_same_site=9O71jCj_OP43YLGcjwEAZyoiu7MFkEV8-8uziErITUz9UtRk; tz=Asia%2FShanghai; color_mode=%7B%22color_mode%22%3A%22auto%22%2C%22light_theme%22%3A%7B%22name%22%3A%22light%22%2C%22color_mode%22%3A%22light%22%7D%2C%22dark_theme%22%3A%7B%22name%22%3A%22dark%22%2C%22color_mode%22%3A%22dark%22%7D%7D; logged_in=yes; dotcom_user=csqting; _gh_sess=37D6rCRVC2JibASPc1N6MTQ%2Fj978CXDTiDCv7QjSMLlWKqGGa1smeK%2B8xZ4fX1b%2BUZtSsFdupYGhCsbx4k7KkHrg834mEVQOzYg4fi26fzkCaB96MaPNs9DbPtwWqgEBgYCZHmqkaAgZTwF8Z6yMyeZofc%2FgqXRPl04gNfdkxHlKImQTfHwem%2Bi2jZIblkreQLuMR04D%2B41HT4jnaNph6ceYbeXx7tb%2BOCRYttZfKgwh6GIUenNsRu39u7jcrrRZQpSneIqSFofDe%2FoYuiOxRZdPPXivPlnEcgRwCEybob9TKkjGWW2KKiV1EaeMDlrb2ecgvBuG2nmjegsAFcwdoN%2FeVMZ%2Bhrk426b6uGsj%2FLK%2Bs70JeUslEcM65VTcEi7vd7cCWxN%2Bx12YQOn0KQ%2BDUzbix%2FCbZolCWWQs0%2Fw5GrFO2XXs98zPwQut%2FuUV2KGu5%2FXBpI7rUIQrrP063I8izSFVbYUJ51poKdpjmwg0O6VZXhixotftkPNXHO2NCJrzCV1IK9TVrjHeYQYalPokpTtINwh6TGuDmfJLdZnCDr%2F7CeHs6WaQMJ%2Bz4UboIsBUuQjkPHycBDNpQ3%2B3unD1SAQblWXdsW4IQJhgWWkojyfk70iONTDjoTNsFL2UGcMjDbgPAT70OCzyRCV7ZjK6lAeTmtKOkfxECEnfXJz%2FThhMiQUPm49cT4qxaAgkvoqRCNjb8o2l8Q1ZI%2B%2B83QhKZmOxmFDB%2BmVtVlCG%2FdRtbVbvNtOwYdiBQ2hwlYUoiZVw89t0fPTfzMnwneF3OIAyTq4n7ugooJqkCKKUpMWFT4PzReb59xnSfVNlROKm6B%2BIoNEkFnADWLbDMoh0jyRPT6Kzo6GZDXGPCn5Pdbj1QrtCa4I8thaHFw%3D%3D--TLPPQIkHZvUr%2FeNc--B%2FBP6cHV2VuqyNvzo7JmXQ%3D%3D'
# 方案一
# cookies_list = temp.split('; ')
# cookies = {}
# for cookie in cookies_list:
# cookies[cookie.split('=')[0]] = cookie.split('=')[-1]
#
# response = requests.get(url, headers=headers, cookies=cookies)
#
#
# with open('guitub_2.html', 'wb') as f:
# f.write(response.content)
# 方案二(工作中用)
cookies_list = temp.split('; ')
cookies = {cookie.split('=')[0] : cookie.split('=')[1] for cookie in cookies_list}
print(cookies)
response = requests.get(url, headers=headers, cookies=cookies)
# with open('guitub_3.html', 'wb') as f:
# f.write(response.content)
2.cookiejar的处理(存储和管理 cookie)
# -*- coding: utf-8 -*-
import requests
url = 'https://www.baidu.com'
response = requests.get(url)
print(response.cookies)
# 对cookiejar对象的处理,将cookiejar转换为字典:
dict_cookies = requests.utils.dict_from_cookiejar(response.cookies)
print(dict_cookies)
# 将字典转换回cookiejar
jar_cookies = requests.utils.cookiejar_from_dict(dict_cookies)
print(jar_cookies)
3. 模拟网络波动,timeout的使用
# -*- coding: utf-8 -*-
import requests
url = 'https://twitter.com'
response = requests.get(url, timeout=3)
print(response.cookies)