当前位置：首页 > article >正文

基于百度翻译的python爬虫示例

article 2025/3/20 8:26:09

（今年 Java 工作真难找啊，有广州 Java 高级岗位招人的好心人麻烦推荐一下，拜谢。）

花了一周时间，从零基础开始学习了 Python。学有所获之后，就总想爬些什么，不然感觉不得劲，所以又花了一天时间整出了个百度翻译的爬虫示例，时间主要花在了寻找 token、sign 以及调试请求上。代码有点乱，毕竟只是 demo，但是功能是实现了的。

import requests
import js2py
import re
from urllib.parse import urlencode

# Fetch the Baidu Translate landing page; its body is searched below for the
# `token` and `gtk` values needed to build the translation request.
url = "https://fanyi.baidu.com/#zh/en/"
session = requests.session()
headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 16_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/16.6 Mobile/15E148 Safari/604.1',
}

# Hard-coded BAIDUID cookie sent with every request in this script.
cookies = {
    'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1',
}

# The timeout keeps the script from blocking forever on an unresponsive server.
response = session.get(url, headers=headers, cookies=cookies, timeout=10)

print(dict(response.cookies))

# Decode the body once and reuse it for the dump and both regex searches.
page_html = response.content.decode()

# Keep a copy of the page for debugging. The explicit encoding avoids a
# UnicodeEncodeError on systems whose default locale encoding is not UTF-8.
with open('baidu.html', 'w', encoding='utf-8') as f:
    f.write(page_html)

token_pattern = r"token:\s*'([a-f0-9]+)'"
token_match = re.search(token_pattern, page_html)
if token_match is None:
    # Fail with a clear message instead of an AttributeError on None.
    raise RuntimeError("token not found in the Baidu Translate page")
token = token_match.group(1)

# Raw string: "\s" in a normal string literal is an invalid escape sequence.
gtk_pattern = r"gtk:\s*'([^']+)'"
gtk_match = re.search(gtk_pattern, page_html)
if gtk_match is None:
    raise RuntimeError("gtk not found in the Baidu Translate page")
gtk = gtk_match.group(1)

print(token)
print(gtk)



# Compute the sign: prepare a js2py context that the signing script runs in.
context = js2py.EvalJs()
# Read the helper script with an explicit encoding so the result does not
# depend on the platform's default locale encoding.
with open('public.js', 'r', encoding='utf-8') as f:
    public_js = f.read()
context.execute(public_js)
# Expose the query text and the scraped values as JS globals.
context.wd = '好好学习，天天向上'
context.token = token
context.gtk = gtk

# Query the suggestion endpoint for the same text; the timeout prevents an
# indefinite hang if the server does not respond.
sug_response = session.post(
    "https://fanyi.baidu.com/sug",
    data={'kw': context.wd},
    headers=headers,
    timeout=10,
)
print(sug_response.json())

# Define the signing routine inside the JS context and run it once.
# The JS source below is executed verbatim by js2py:
#   n(r, o): walks `o` three characters at a time. The third character is a
#            shift amount (a digit, or a character >= "a" mapped to its char
#            code minus 87, so "a" -> 10); the second selects `>>>` when it is
#            "+" and `<<` otherwise; the first selects addition masked to
#            32 bits when it is "+" and XOR otherwise.
#   a(r):    if `r` has more than 30 characters, keeps only its first 10,
#            middle 10 and last 10; splits the JS global `gtk` on "." into two
#            numbers; expands `r` into UTF-8 byte values; folds each byte into
#            an accumulator (seeded with the first gtk number) using
#            n(w, "+-a^+6"); finishes with n(w, "+-3^+b+-f"), XORs with the
#            second gtk number, maps a negative result to its unsigned value,
#            reduces it modulo 1e6 and returns "<w>.<w ^ first gtk number>".
# The script ends by storing a(wd) in the JS global `sign`, so `wd` and `gtk`
# must already be set on `context` before this call.
context.execute("""
     function n(r, o) {
        for (var t = 0; t < o.length - 2; t += 3) {
            var e = o.charAt(t + 2);
            e = e >= "a" ? e.charCodeAt(0) - 87 : Number(e),
            e = "+" === o.charAt(t + 1) ? r >>> e : r << e,
            r = "+" === o.charAt(t) ? r + e & 4294967295 : r ^ e
        }
        return r
    }

     function a(r) {
   
        var a = r.length;
        a > 30 && (r = "" + r.substr(0, 10) + r.substr(Math.floor(a / 2) - 5, 10) + r.substr(-10, 10))
       
        var l = void 0
          , d = "" + String.fromCharCode(103) + String.fromCharCode(116) + String.fromCharCode(107);
        l = gtk;
        for (var m = l.split("."), S = Number(m[0]) || 0, s = Number(m[1]) || 0, c = [], v = 0, F = 0; F < r.length; F++) {
            var p = r.charCodeAt(F);
            128 > p ? c[v++] = p : (2048 > p ? c[v++] = p >> 6 | 192 : (55296 === (64512 & p) && F + 1 < r.length && 56320 === (64512 & r.charCodeAt(F + 1)) ? (p = 65536 + ((1023 & p) << 10) + (1023 & r.charCodeAt(++F)),
            c[v++] = p >> 18 | 240,
            c[v++] = p >> 12 & 63 | 128) : c[v++] = p >> 12 | 224,
            c[v++] = p >> 6 & 63 | 128),
            c[v++] = 63 & p | 128)
        }
        for (var w = S, A = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(97) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(54)), b = "" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(51) + ("" + String.fromCharCode(94) + String.fromCharCode(43) + String.fromCharCode(98)) + ("" + String.fromCharCode(43) + String.fromCharCode(45) + String.fromCharCode(102)), D = 0; D < c.length; D++)
            w += c[D],
            w = n(w, A);
        return w = n(w, b),
        w ^= s,
        0 > w && (w = (2147483647 & w) + 2147483648),
        w %= 1e6,
        w.toString() + "." + (w ^ S)
    }

    var sign = a(wd)
""")

# Read the computed signature back from the JS context and show it.
print(context.sign)


# Send the translation request, signed with the scraped token and the
# computed sign.
url = 'https://fanyi.baidu.com/basetrans'
data = {
   "query": context.wd,
   "from": "zh",
   "to": "en",
   "token": token,
   "sign": context.sign
}
# Only used to show the form body that will be sent; requests encodes `data`
# itself.
encoded_data = urlencode(data)
print(cookies)
print(encoded_data)
# The `headers` dict defined at the top of the script is reused as-is; the
# original code re-declared an identical copy here.

# Original author's note: requests made through the session change the
# User-Agent, e.g. session.headers is
# {'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
# NOTE(review): session.headers only holds the session's defaults; headers
# passed per call are expected to override them - confirm before switching
# this call to session.post.
print(session.headers)
# The timeout prevents an indefinite hang if the server does not respond.
response = requests.post(
    url,
    headers=headers,
    cookies=cookies,
    data=data,
    verify=True,
    timeout=10,
)
print(response.json())


"""
wd=全家的执行结果：
{}
3d7980a56760ca30e97aeeeda8e8fc6d
320305.131321201
{'errno': 0, 'data': [{'k': '全家福', 'v': '（全家合影） a photograph of the whole family; （中餐菜名） ho'}, {'k': '全家团聚', 'v': '动. whole family gather'}], 'logid': 2318810217}
681757.951340
{'BAIDUID': '624820D8D9163F370A491E7CA70C23D4:SL=0:NR=10:FG=1'}
query=%E5%85%A8%E5%AE%B6&from=zh&to=en&token=3d7980a56760ca30e97aeeeda8e8fc6d&sign=681757.951340
{'User-Agent': 'python-requests/2.32.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': '*/*', 'Connection': 'keep-alive'}
{'errno': 0, 'from': 'zh', 'to': 'en', 'trans': [{'dst': 'whole family', 'prefixWrap': 0, 'result': [[0, 'whole family', ['0|6'], [], ['0|6'], ['0|12']]], 'src': '全家'}], 'dict': {'symbols': [{'word_symbol': 'quán jiā', 'parts': [{'part_name': '名', 'means': [{'text': 'the whole family', 'word_mean': 'the whole family'}]}]}], 'word_name': '全家', 'from': 'green', 'word_means': ['the whole family']}, 'keywords': []}


"""

最新版本的 Python 3.13 不支持 js2py 模块，所以我切换到了 Python 3.8。