基于百度翻译的python爬虫示例
(今年java工作真难找啊,有广州java高级岗位招人的好心人麻烦推一下,拜谢。。)
花了一周时间,从零基础开始学习 Python。学有所获之后总想爬点什么,不然感觉不得劲,所以又花了一天时间写了这个百度翻译的爬虫示例。主要的卡点在于找 token、sign 以及调试请求。代码有点乱,毕竟只是 demo,但功能是实现了的。
import requests
import js2py
import re
from urllib.parse import urlencode
# Step 1: download the translate page and pull the `token` and `gtk` values
# out of the response body with regular expressions.
url = "https://fanyi.baidu.com/#zh/en/"
session = requests.session()
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
}
cookies = {
    'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1',
}
# A timeout keeps the script from hanging forever if the server never answers.
response = session.get(url, headers=headers, cookies=cookies, timeout=10)
print(dict(response.cookies))

# Decode the body once and reuse it for the dump file and both searches.
page_html = response.content.decode()

# Keep a copy of the page for debugging. The text was decoded as UTF-8, so it
# is written back as UTF-8 instead of the platform's locale encoding.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(page_html)

token_pattern = r"token:\s*'([a-f0-9]+)'"
token_match = re.search(token_pattern, page_html)
if token_match is None:
    raise RuntimeError("token not found in the page; see baidu.html")
token = token_match.group(1)

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
gtk_pattern = r"gtk:\s*'([^']+)'"
gtk_match = re.search(gtk_pattern, page_html)
if gtk_match is None:
    raise RuntimeError("gtk not found in the page; see baidu.html")
gtk = gtk_match.group(1)

print(token)
print(gtk)
# Step 2: compute `sign` by running the signing JavaScript under js2py.
context = js2py.EvalJs()

# public.js is an external file executed before the inline script below.
# NOTE(review): its contents are not visible here; the inline script defines
# every function it calls itself (`n` and `a`) - confirm whether public.js is
# still required.
with open('public.js', 'r', encoding='utf-8') as f:
    public_js = f.read()
context.execute(public_js)

# Values exposed to the JavaScript as globals. The inline script reads `wd`
# (the text to translate) and `gtk`; it does not read `token`.
context.wd = '好好学习,天天向上'
context.token = token
context.gtk = gtk

# Query the suggestion endpoint with the same text and print its JSON reply.
sug_response = session.post(
    "https://fanyi.baidu.com/sug",
    data={'kw': context.wd},
    headers=headers,
    timeout=10,
)
print(sug_response.json())

# The JavaScript below is passed to js2py verbatim:
#   n(r, o): walks `o` three characters at a time. The third character is a
#            shift amount ('a'..'f' mean 10..15), the second selects ">>>"
#            ('+') or "<<", and the first selects 32-bit-masked addition ('+')
#            or XOR of the shifted value into r.
#   a(r):    if r is longer than 30 characters, keeps only its first 10,
#            middle 10 and last 10; splits the global `gtk` on "." into two
#            numbers S and s; expands r into UTF-8 byte values; folds each
#            byte into an accumulator with n(w, "+-a^+6"); finishes with
#            n(w, "+-3^+b+-f"), XORs with s, makes the value non-negative,
#            reduces it modulo 1e6 and returns "<w>.<w ^ S>".
#   The result of a(wd) is stored in the global `sign`.
context.execute("""
function n(r, o) {
for (var t = 0; t < o.length - 2; t += 3) {
var e = o.charAt(t + 2);
e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
}
return r
}
function a(r) {
var a = r.length;
a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
var l = void 0
, d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
l = gtk;
for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
var p = r.charCodeAt(F);
128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
c[v++] = p >> 18 | 240,
c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
c[v++] = p >> 6 & 63 | 128),
c[v++] = 63 & p | 128)
}
for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
w += c[D],
w = n(w, A);
return w = n(w, b),
w ^= s,
0 > w && (w = (2147483647 & w) + 2147483648),
w %= 1e6,
w.toString() + "." + (w ^ S)
}
var sign = a(wd)
""")
print(context.sign)
# Step 3: send the translation request with the token and computed sign.
url = 'https://fanyi.baidu.com/basetrans'
data = {
    "query": context.wd,
    "from": "zh",
    "to": "en",
    "token": token,
    "sign": context.sign,
}
# The URL-encoded form is only printed for debugging; requests encodes `data`
# itself when sending the POST.
encoded_data = urlencode(data)
print(cookies)
print(encoded_data)

# session.headers holds the session's *default* headers (for example
# 'User-Agent': 'python-requests/2.32.3'). Headers passed to an individual
# request override those defaults for that request, so the session does not
# replace a User-Agent supplied via `headers=`.
print(session.headers)

# The `headers` dict defined for the first request is reused as-is; the
# original redefined an identical copy here. This request goes through
# requests.post rather than the session, so only the explicit `cookies` dict
# is sent. TLS verification is on by default, so `verify=True` is not needed.
response = requests.post(url, headers=headers, cookies=cookies, data=data, timeout=10)
print(response.json())
"""
wd=全家的执行结果:
{}
3d7980a56760ca30e97aeeeda8e8fc6d
320305.131321201
{'errno': 0, 'data': [{'k': '全家福', 'v': '(全家合影) a photograph of the whole family; (中餐菜名) ho'}, {'k': '全家团聚', 'v': '动. whole family gather'}], 'logid': 2318810217}
681757.951340
{'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1'}
query=%E5%85%A8%E5%AE%B6&from=zh&to=en&token=3d7980a56760ca30e97aeeeda8e8fc6d&sign=681757.951340
{'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
{'errno': 0, 'from': 'zh', 'to': 'en', 'trans': [{'dst': 'whole family', 'prefixWrap': 0, 'result': [[0, 'whole family', ['0|6'], [], ['0|6'], ['0|12']]], 'src': '全家'}], 'dict': {'symbols': [{'word_symbol': 'quán jiā', 'parts': [{'part_name': '名', 'means': [{'text': 'the whole family', 'word_mean': 'the whole family'}]}]}], 'word_name': '全家', 'from': 'green', 'word_means': ['the whole family']}, 'keywords': []}
"""
注意:js2py 模块不支持最新的 Python 3.13,所以我切换到了 Python 3.8 来运行这个示例。