尚硅谷爬虫note005
一、编解码
1.get请求的quote方法
将汉字转为URL百分号编码(先按UTF-8编码为字节,再把每个字节写成%XX的形式)
# _*_ coding : utf-8 _*_
# @Time : 2025/2/12 16:33
# @Author : 20250206-里奥
# @File : demo19_get请求的quote方法
# @Project : PythonProject10-14
# 景甜page
# https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=%E6%99%AF%E7%94%9C
#13用
# User-Agent
# Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0
#导入request
import urllib.request
from wsgiref.util import request_uri
#导入urllib.parse
import urllib.parse
from demo14_urllib import response
# Browser-like request headers, kept as a dict so they can be handed to a
# Request object; the active code below only builds and prints the URL.
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0"}

# Percent-encode the search keyword. quote() encodes the text as UTF-8 and
# writes every non-ASCII byte as %XX, which makes it safe to place in a URL.
name = urllib.parse.quote('景甜')

# Baidu search URL: the fixed query-string prefix ends with "wd=", so the
# encoded keyword is appended directly after it.
url = "https://www.baidu.com/s?ie=utf-8&f=8&rsv_bp=1&rsv_idx=1&tn=baidu&wd=" + name
print(url)
# print(name)
# #3.模拟浏览器向服务器发送请求
# #3-2.urlopen中不允许传递字典形式的数据
# #4.请求对象的定制,需要指定关键字传参(参数顺序原因)
# request = urllib.request.Request(url = url,headers=headers)
# response = urllib.request.urlopen(request)
# #5.获取响应内容
# content = response.read().decode("utf-8")
# print(response)
2.get请求的urlencode方法
# _*_ coding : utf-8 _*_
# @Time : 2025/2/13 08:53
# @Author : 20250206-里奥
# @File : demo20_url的encode方法
# @Project : PythonProject10-14
import urllib.parse
#
# url = "https://www.baidu.com/s?wd=景甜&sex=女"
# # url的encode方法,参数以字典方式存在
# data = {
# "wd":"景甜",
# "sex":"女"
# }
#
# #将urlencode传递进去
# a = urllib.parse.urlencode(data)
# print(a)
# 导入
import urllib.request
import urllib.parse
from demo14_urllib import response
# GET request demo: build the query string with urlencode() and fetch the page.

# Base of the search URL; the encoded query string goes right after the "?".
basic_url = "https://www.baidu.com/s?"

# Query parameters as a dict, so several key/value pairs can be encoded at once.
data = {
    "wd": "景甜",
    "sex": "女",
}

# urlencode() percent-encodes each key and value and joins the pairs with "&".
new_data = urllib.parse.urlencode(data)
print(new_data)

# Full resource URL = base URL + encoded query string.
url = basic_url + new_data

# Send a browser User-Agent with the request.
headers = {
    "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:135.0) Gecko/20100101 Firefox/135.0",
}

# Customised request object; keyword arguments are required here because
# the second positional parameter of Request is `data`, not `headers`.
request = urllib.request.Request(url=url, headers=headers)

# Send the request to the server. The `with` block closes the HTTP response
# (and its underlying connection) once the body has been read.
with urllib.request.urlopen(request) as response:
    # Read the page source and decode the bytes as UTF-8 text.
    content = response.read().decode("utf-8")

# Print the page source.
print(content)
3. post请求方式
# _*_ coding : utf-8 _*_
# @Time : 2025/2/13 10:27
# @Author : 20250206-里奥
# @File : demo21_get的post请求方法
# @Project : PythonProject10-14
# 导入
import urllib.request
import urllib.parse
from idlelib.rpc import response_queue
from demo17_qingqiuduixaingdedingzhi import request, content
# POST request demo: query the Baidu Translate suggestion endpoint.
import json

url = "https://fanyi.baidu.com/sug"

# Request headers: send a browser User-Agent.
headers = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36"
}

# Form parameters of the POST request.
data = {
    "kw": "hello"
}

# POST parameters need two steps: urlencode() turns the dict into a
# "key=value" string, and .encode("utf-8") turns that string into bytes,
# because the request body passed to Request must be bytes, not str.
data = urllib.parse.urlencode(data).encode("utf-8")

# POST parameters are not appended to the URL; they are passed as `data`
# when the request object is created.
request = urllib.request.Request(url=url, data=data, headers=headers)

# Send the request to the server. The `with` block closes the HTTP response
# (and its underlying connection) once the body has been read.
with urllib.request.urlopen(request) as response:
    # Read the response body and decode the bytes as UTF-8 text.
    content = response.read().decode("utf-8")

# Print the raw response text (a str at this point).
print(content)

# Parse the JSON text into a Python object and print it.
obj = json.loads(content)
print(obj)
#post请求特点
# post请求方式参数,必须编码
# 编码之后,必须调用encode()方法
# 参数放在请求对象定制的方法中