当前位置: 首页 > article >正文

爬取b站评论

本博客旨在分享关于爬虫技术的学习和实践经验,仅供学习使用,请使用爬虫技术的用户自行承担相应的法律责任,务必在进行任何网络数据抓取操作之前,仔细审查相关法律法规,并取得相应的授权或同意。请确保你的行为符合道德和法律的双重标准,尊重知识产权和网站的服务协议,仅将此技术应用于正当、合法的学习和研究目的。

爬取b站评论

  • 1.单视频评论
  • 2.单视频弹幕

1.单视频评论

爬取某个视频的评论时,只需要对请求载荷中的 w_rid 和 wts 这两个参数进行加密(签名),然后请求网络拿到数据,拿到的数据不需要解密。
这里的加密我直接扣js代码破解,比较简单,首先在comment_url.js里面写加密函数。这里有一个参数是从本地的localstorage里面拿到的,这里我直接写死了,但其实也没用,因为o和i都是常量。

/**
 * Build the WBI signature fields for a request payload.
 *
 * The signing key is derived from two 32-character keys (imgKey + subKey)
 * shuffled through a fixed index table and truncated to 32 characters. The
 * payload, plus the current Unix timestamp `wts`, is serialised as a sorted,
 * URL-encoded query string; that string concatenated with the signing key is
 * hashed with `at` (the hashing function defined elsewhere in this file).
 *
 * @param {Object} e - Request parameters to sign. Not mutated.
 * @param {Object} [options] - Optional key override. When
 *   `options.useAssignKey` is truthy, `options.wbiImgKey` / `options.wbiSubKey`
 *   are used verbatim; otherwise the keys are taken from the hard-coded URLs.
 * @returns {{w_rid: string, wts: string}|string} The signature fields, or the
 *   sentinel string "sssss" when either key is empty (kept for backward
 *   compatibility with existing callers).
 */
function lt(e, options = { wbiImgKey: "", wbiSubKey: "" }) {
    // Stand-in for the browser's "wbi_img_urls" localStorage entry, which has
    // the shape "<img url>-<sub url>". The key is the file name without extension.
    const wbiImgUrls = "https://i0.hdslb.com/bfs/wbi/7cd084941338484aae1ad9425b84077c.png-https://i0.hdslb.com/bfs/wbi/4932caff0ff746eab6f01bf08b70ac45.png";
    // Order in which characters of imgKey + subKey are picked for the signing key.
    const mixinOrder = [46, 47, 18, 2, 53, 8, 23, 32, 15, 50, 10, 31, 58, 3, 45, 35, 27, 43, 5, 49, 33, 9, 42, 19, 29, 28, 14, 39, 12, 38, 41, 13, 37, 48, 7, 16, 24, 55, 40, 61, 26, 17, 0, 1, 60, 51, 30, 4, 22, 25, 54, 21, 56, 59, 6, 63, 57, 62, 11, 36, 20, 34, 44, 52];

    let imgKey;
    let subKey;
    if (options.useAssignKey) {
        imgKey = options.wbiImgKey;
        subKey = options.wbiSubKey;
    } else {
        const urls = wbiImgUrls.split("-");
        imgKey = ft(urls[0]);
        subKey = ft(urls[1]);
    }
    if (!(imgKey && subKey)) {
        return "sssss";
    }

    // Out-of-range indexes yield "" from charAt, so they simply drop out of the join.
    const merged = imgKey + subKey;
    const mixinKey = mixinOrder.map((index) => merged.charAt(index)).join("").slice(0, 32);

    const wts = Math.round(Date.now() / 1e3);
    const signedParams = Object.assign({}, e, { wts: wts });
    const pairs = [];
    for (const key of Object.keys(signedParams).sort()) {
        let value = signedParams[key];
        if (value && typeof value === "string") {
            // These characters are stripped from string values before encoding.
            value = value.replace(/[!'()*]/g, "");
        }
        // null / undefined values are left out of the signed query entirely.
        if (value != null) {
            pairs.push(encodeURIComponent(key) + "=" + encodeURIComponent(value));
        }
    }

    return {
        w_rid: at(pairs.join("&") + mixinKey),
        wts: wts.toString()
    };
}
/**
 * Extract the base file name (text after the last "/" and before the first
 * "." that follows it) from a URL or path string.
 *
 * @param {string} e - URL or path.
 * @returns {string} File name without anything from the first dot onwards.
 */
function ft(e) {
    const fileName = e.slice(e.lastIndexOf("/") + 1);
    const firstDot = fileName.indexOf(".");
    if (firstDot === -1) {
        return fileName;
    }
    return fileName.slice(0, firstDot);
}
/**
 * Report whether `value` is a Buffer-like object, i.e. its constructor
 * exposes an `isBuffer` function that accepts it.
 *
 * The hashing core calls this as `r(i)` to decide whether its input should be
 * copied into a plain array, so it has to be a real predicate on its argument.
 *
 * @param {*} value - Candidate input.
 * @returns {boolean} True only for Buffer-like values.
 */
function r(value) {
    return value != null
        && value.constructor != null
        && typeof value.constructor.isBuffer === "function"
        && Boolean(value.constructor.isBuffer(value));
}
/**
 * Split an array of 32-bit words into bytes, most significant byte first.
 *
 * @param {number[]} e - 32-bit words.
 * @returns {number[]} Four bytes (0-255) per input word.
 */
function EwordsToBytes(e) {
    const bytes = [];
    for (let bit = 0; bit < 32 * e.length; bit += 8) {
        // bit >>> 5 selects the word; 24 - bit % 32 selects the byte within it.
        bytes.push((e[bit >>> 5] >>> (24 - (bit % 32))) & 255);
    }
    return bytes;
}
/**
 * Pack a byte sequence into 32-bit words, first byte in the most significant
 * position of each word. A trailing partial word is zero-filled.
 *
 * @param {ArrayLike<number>} e - Bytes (0-255).
 * @returns {number[]} Signed 32-bit words.
 */
function EbytesToWords(e) {
    const words = [];
    for (let index = 0, bit = 0; index < e.length; index++, bit += 8) {
        // `undefined | x` evaluates to x, so each word is created on first touch.
        words[bit >>> 5] |= e[index] << (24 - (bit % 32));
    }
    return words;
}
/**
 * Convert a "binary" string to bytes: one byte per UTF-16 code unit, keeping
 * only its low 8 bits.
 *
 * @param {string} e - Input string.
 * @returns {number[]} Bytes (0-255).
 */
function NstringToBytes(e) {
    const bytes = [];
    for (let index = 0; index < e.length; index++) {
        bytes.push(255 & e.charCodeAt(index));
    }
    return bytes;
}

/**
 * Convert bytes to a "binary" string, one character per byte.
 *
 * @param {ArrayLike<number>} e - Bytes.
 * @returns {string} String whose char codes are the given bytes.
 */
function NbytesToString(e) {
    const chars = [];
    for (let index = 0; index < e.length; index++) {
        chars.push(String.fromCharCode(e[index]));
    }
    return chars.join("");
}

/**
 * Encode a string as UTF-8 bytes.
 *
 * `unescape(encodeURIComponent(s))` turns each UTF-8 byte into one character;
 * it is kept (rather than TextEncoder) so malformed input still raises
 * URIError exactly as before.
 *
 * @param {string} e - Input string.
 * @returns {number[]} UTF-8 bytes.
 */
function TstringToBytes(e) {
    return NstringToBytes(unescape(encodeURIComponent(e)));
}

/**
 * Decode UTF-8 bytes into a string (inverse of TstringToBytes).
 *
 * @param {ArrayLike<number>} e - UTF-8 bytes.
 * @returns {string} Decoded string.
 */
function TbytesToString(e) {
    // Uses the local binary decoder; the previous `rt.bin` reference was undefined.
    return decodeURIComponent(escape(NbytesToString(e)));
}
/**
 * One mixing step using the bitwise "select" function
 * (t & r) | (~t & n): bits of r where t is set, bits of n elsewhere.
 *
 * @param {number} e - Accumulator word being updated.
 * @param {number} t - Word added after the rotation (also the selector).
 * @param {number} r - Second state word.
 * @param {number} n - Third state word.
 * @param {number} o - Message word (treated as unsigned).
 * @param {number} i - Left-rotation amount in bits.
 * @param {number} a - Additive constant.
 * @returns {number} Rotated sum plus t (not truncated to 32 bits).
 */
function hFF(e, t, r, n, o, i, a) {
    const mixed = (t & r) | (~t & n);
    const sum = e + mixed + (o >>> 0) + a;
    const rotated = (sum << i) | (sum >>> (32 - i));
    return rotated + t;
}
/**
 * One mixing step using the bitwise function (t & n) | (r & ~n):
 * bits of t where n is set, bits of r elsewhere.
 *
 * @param {number} e - Accumulator word being updated.
 * @param {number} t - Word added after the rotation.
 * @param {number} r - Second state word.
 * @param {number} n - Third state word (the selector).
 * @param {number} o - Message word (treated as unsigned).
 * @param {number} i - Left-rotation amount in bits.
 * @param {number} a - Additive constant.
 * @returns {number} Rotated sum plus t (not truncated to 32 bits).
 */
function yGG(e, t, r, n, o, i, a) {
    const mixed = (t & n) | (r & ~n);
    const sum = e + mixed + (o >>> 0) + a;
    const rotated = (sum << i) | (sum >>> (32 - i));
    return rotated + t;
}
/**
 * One mixing step using the bitwise parity function t ^ r ^ n.
 *
 * @param {number} e - Accumulator word being updated.
 * @param {number} t - Word added after the rotation.
 * @param {number} r - Second state word.
 * @param {number} n - Third state word.
 * @param {number} o - Message word (treated as unsigned).
 * @param {number} i - Left-rotation amount in bits.
 * @param {number} a - Additive constant.
 * @returns {number} Rotated sum plus t (not truncated to 32 bits).
 */
function vHH(e, t, r, n, o, i, a) {
    const mixed = t ^ r ^ n;
    const sum = e + mixed + (o >>> 0) + a;
    const rotated = (sum << i) | (sum >>> (32 - i));
    return rotated + t;
}
/**
 * One mixing step using the bitwise function r ^ (t | ~n).
 *
 * @param {number} e - Accumulator word being updated.
 * @param {number} t - Word added after the rotation.
 * @param {number} r - Second state word.
 * @param {number} n - Third state word.
 * @param {number} o - Message word (treated as unsigned).
 * @param {number} i - Left-rotation amount in bits.
 * @param {number} a - Additive constant.
 * @returns {number} Rotated sum plus t (not truncated to 32 bits).
 */
function bII(e, t, r, n, o, i, a) {
    const mixed = r ^ (t | ~n);
    const sum = e + mixed + (o >>> 0) + a;
    const rotated = (sum << i) | (sum >>> (32 - i));
    return rotated + t;
}
/**
 * MD5 compression core (the constants and step order follow RFC 1321).
 *
 * @param {*} i - Message. A String is converted to bytes (NstringToBytes when
 *   a.encoding === "binary", otherwise TstringToBytes); a value for which the
 *   external predicate r(i) is truthy is copied with Array.prototype.slice;
 *   arrays and Uint8Array are used as-is; anything else is replaced by its
 *   toString() result.
 * @param {Object} [a] - Options; only `encoding` is read here.
 * @returns {number[]} The four state words [c, l, f, d] after being passed
 *   through endian().
 */
o = function o(i, a) {
    // Normalise the input (see the parameter description above).
    i.constructor == String ? i = a && "binary" === a.encoding ? NstringToBytes(i) : TstringToBytes(i) : r(i) ? i = Array.prototype.slice.call(i, 0) : Array.isArray(i) || i.constructor === Uint8Array || (i = i.toString());
    // u: message as 32-bit words; s: message length in bits;
    // c, l, f, d: initial state words 0x67452301, 0xEFCDAB89, 0x98BADCFE, 0x10325476.
    for (var u = EbytesToWords(i), s = 8 * i.length, c = 1732584193, l = -271733879, f = -1732584194, d = 271733878, p = 0; p < u.length; p++)
        // Reverse the byte order of every message word.
        u[p] = 16711935 & (u[p] << 8 | u[p] >>> 24) | 4278255360 & (u[p] << 24 | u[p] >>> 8);
    // Padding: set the 0x80 marker bit right after the message and store the
    // bit length in word 14 of the final 16-word chunk.
    u[s >>> 5] |= 128 << s % 32,
    u[14 + (s + 64 >>> 9 << 4)] = s;
    // NOTE(review): o._ff/_gg/_hh/_ii are not assigned anywhere in the visible
    // file, and h/y/v/b are never used below — the steps call hFF/yGG/vHH/bII directly.
    var h = o._ff
      , y = o._gg
      , v = o._hh
      , b = o._ii;
    // Process the message in chunks of 16 words.
    for (p = 0; p < u.length; p += 16) {
        // Remember the state at the start of the chunk; it is added back at the end.
        var m = c
          , w = l
          , g = f
          , x = d;
        // Steps 1-16 (hFF).
        c = hFF(c, l, f, d, u[p + 0], 7, -680876936),
        d = hFF(d, c, l, f, u[p + 1], 12, -389564586),
        f = hFF(f, d, c, l, u[p + 2], 17, 606105819),
        l = hFF(l, f, d, c, u[p + 3], 22, -1044525330),
        c = hFF(c, l, f, d, u[p + 4], 7, -176418897),
        d = hFF(d, c, l, f, u[p + 5], 12, 1200080426),
        f = hFF(f, d, c, l, u[p + 6], 17, -1473231341),
        l = hFF(l, f, d, c, u[p + 7], 22, -45705983),
        c = hFF(c, l, f, d, u[p + 8], 7, 1770035416),
        d = hFF(d, c, l, f, u[p + 9], 12, -1958414417),
        f = hFF(f, d, c, l, u[p + 10], 17, -42063),
        l = hFF(l, f, d, c, u[p + 11], 22, -1990404162),
        c = hFF(c, l, f, d, u[p + 12], 7, 1804603682),
        d = hFF(d, c, l, f, u[p + 13], 12, -40341101),
        f = hFF(f, d, c, l, u[p + 14], 17, -1502002290),
        // The last hFF step is nested as an argument of the first yGG step; steps 17-32 (yGG) follow.
        c = yGG(c, l = hFF(l, f, d, c, u[p + 15], 22, 1236535329), f, d, u[p + 1], 5, -165796510),
        d = yGG(d, c, l, f, u[p + 6], 9, -1069501632),
        f = yGG(f, d, c, l, u[p + 11], 14, 643717713),
        l = yGG(l, f, d, c, u[p + 0], 20, -373897302),
        c = yGG(c, l, f, d, u[p + 5], 5, -701558691),
        d = yGG(d, c, l, f, u[p + 10], 9, 38016083),
        f = yGG(f, d, c, l, u[p + 15], 14, -660478335),
        l = yGG(l, f, d, c, u[p + 4], 20, -405537848),
        c = yGG(c, l, f, d, u[p + 9], 5, 568446438),
        d = yGG(d, c, l, f, u[p + 14], 9, -1019803690),
        f = yGG(f, d, c, l, u[p + 3], 14, -187363961),
        l = yGG(l, f, d, c, u[p + 8], 20, 1163531501),
        c = yGG(c, l, f, d, u[p + 13], 5, -1444681467),
        d = yGG(d, c, l, f, u[p + 2], 9, -51403784),
        f = yGG(f, d, c, l, u[p + 7], 14, 1735328473),
        // The last yGG step is nested as an argument of the first vHH step; steps 33-48 (vHH) follow.
        c = vHH(c, l = yGG(l, f, d, c, u[p + 12], 20, -1926607734), f, d, u[p + 5], 4, -378558),
        d = vHH(d, c, l, f, u[p + 8], 11, -2022574463),
        f = vHH(f, d, c, l, u[p + 11], 16, 1839030562),
        l = vHH(l, f, d, c, u[p + 14], 23, -35309556),
        c = vHH(c, l, f, d, u[p + 1], 4, -1530992060),
        d = vHH(d, c, l, f, u[p + 4], 11, 1272893353),
        f = vHH(f, d, c, l, u[p + 7], 16, -155497632),
        l = vHH(l, f, d, c, u[p + 10], 23, -1094730640),
        c = vHH(c, l, f, d, u[p + 13], 4, 681279174),
        d = vHH(d, c, l, f, u[p + 0], 11, -358537222),
        f = vHH(f, d, c, l, u[p + 3], 16, -722521979),
        l = vHH(l, f, d, c, u[p + 6], 23, 76029189),
        c = vHH(c, l, f, d, u[p + 9], 4, -640364487),
        d = vHH(d, c, l, f, u[p + 12], 11, -421815835),
        f = vHH(f, d, c, l, u[p + 15], 16, 530742520),
        // The last vHH step is nested as an argument of the first bII step; steps 49-64 (bII) follow.
        c = bII(c, l = vHH(l, f, d, c, u[p + 2], 23, -995338651), f, d, u[p + 0], 6, -198630844),
        d = bII(d, c, l, f, u[p + 7], 10, 1126891415),
        f = bII(f, d, c, l, u[p + 14], 15, -1416354905),
        l = bII(l, f, d, c, u[p + 5], 21, -57434055),
        c = bII(c, l, f, d, u[p + 12], 6, 1700485571),
        d = bII(d, c, l, f, u[p + 3], 10, -1894986606),
        f = bII(f, d, c, l, u[p + 10], 15, -1051523),
        l = bII(l, f, d, c, u[p + 1], 21, -2054922799),
        c = bII(c, l, f, d, u[p + 8], 6, 1873313359),
        d = bII(d, c, l, f, u[p + 15], 10, -30611744),
        f = bII(f, d, c, l, u[p + 6], 15, -1560198380),
        l = bII(l, f, d, c, u[p + 13], 21, 1309151649),
        c = bII(c, l, f, d, u[p + 4], 6, -145523070),
        d = bII(d, c, l, f, u[p + 11], 10, -1120210379),
        f = bII(f, d, c, l, u[p + 2], 15, 718787259),
        l = bII(l, f, d, c, u[p + 9], 21, -343485551),
        // Add the chunk's starting state back in, truncating to unsigned 32 bits.
        c = c + m >>> 0,
        l = l + w >>> 0,
        f = f + g >>> 0,
        d = d + x >>> 0
    }
    // Byte-swap each state word before returning.
    return endian([c, l, f, d])
};
/**
 * Rotate a 32-bit integer left.
 *
 * @param {number} e - Value (treated as 32 bits).
 * @param {number} t - Rotation amount in bits.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
function rotl(e, t) {
    return (e << t) | (e >>> (32 - t));
}

/**
 * Rotate a 32-bit integer right.
 *
 * @param {number} e - Value (treated as 32 bits).
 * @param {number} t - Rotation amount in bits.
 * @returns {number} Rotated value as a signed 32-bit integer.
 */
function rotr(e, t) {
    return (e << (32 - t)) | (e >>> t);
}
/**
 * Reverse the byte order of a 32-bit word, or of every element of an array.
 *
 * @param {number|Array} e - A word, or an array of words (nested arrays are
 *   handled recursively). Arrays are modified in place.
 * @returns {number|Array} The swapped word, or the same array instance.
 */
function endian(e) {
    if (e.constructor == Number) {
        // rotl by 8 puts bytes 0 and 2 in their swapped lanes, rotl by 24 bytes 1 and 3.
        const lowLanes = 0x00ff00ff & rotl(e, 8);
        const highLanes = 0xff00ff00 & rotl(e, 24);
        return lowLanes | highLanes;
    }
    let index = 0;
    while (index < e.length) {
        e[index] = endian(e[index]);
        index += 1;
    }
    return e;
}
/**
 * Render bytes as a lowercase hexadecimal string, two digits per byte.
 *
 * @param {ArrayLike<number>} e - Bytes (0-255).
 * @returns {string} Hex string of length 2 * e.length.
 */
function bytesToHex(e) {
    const digits = [];
    for (let index = 0; index < e.length; index++) {
        // High nibble first, then low nibble.
        digits.push((e[index] >>> 4).toString(16), (15 & e[index]).toString(16));
    }
    return digits.join("");
}
/**
 * Hash a message with `o` and return the digest in the requested form.
 *
 * @param {*} t - Message; must not be null or undefined.
 * @param {Object} [r] - Options, forwarded to `o`. `asBytes` returns the
 *   digest bytes; otherwise `asString` returns them as a binary string;
 *   otherwise a hex string is returned.
 * @returns {number[]|string} Digest.
 * @throws {Error} "Illegal argument ..." when t is null or undefined.
 */
var Qe = function (t, r) {
    if (t === null || t === undefined) {
        throw new Error("Illegal argument " + t);
    }
    var digestBytes = EwordsToBytes(o(t, r));
    if (r && r.asBytes) {
        return digestBytes;
    }
    if (r && r.asString) {
        return NbytesToString(digestBytes);
    }
    return bytesToHex(digestBytes);
};
/**
 * Unwrap an ES-module-style export: when `e` is flagged `__esModule` and has
 * its own `default` property, return that property; otherwise return `e`.
 *
 * @param {*} e - Module object or plain value.
 * @returns {*} `e.default` or `e` itself.
 */
function Ze(e) {
    const isWrappedModule = e
        && e.__esModule
        && Object.prototype.hasOwnProperty.call(e, "default");
    if (isWrappedModule) {
        return e.default;
    }
    return e;
}
// Hashing entry point used by lt(): Qe, unwrapped in case it is an ES-module-style export.
var at = Ze(Qe);

// Example payload for a follow-up page: pagination_str carries the session id
// returned with the first page.
// e={
//     "oid": "1906333968",
//     "type": 1,
//     "mode": 3,
//     "pagination_str": "{\"offset\":\"{\\\"type\\\":1,\\\"direction\\\":1,\\\"session_id\\\":\\\"1778169679258543\\\",\\\"data\\\":{}}\"}",
//     "plat": 1,
//     "web_location": 1315875
// }

// Sample first-page payload (empty offset). Declared with `var` so the script
// also runs in strict mode instead of relying on an implicit global.
var e = {
    "oid": "1906333968",
    "type": 1,
    "mode": 3,
    "pagination_str": "{\"offset\":\"\"}",
    "plat": 1,
    "seek_rpid": "",
    "web_location": 1315875
};

// Demo run: print the signature fields for the sample payload.
console.log(lt(e));

然后在py里面调用js文件,获得加密后的载荷数据,然后请求,并对数据进行分析和保存,这里我存成了csv文件。这里需要注意的是参数有的是字符串有的是数字,所以要严格按照输出的格式来写,否则加密结果会一直通不过验证;还有就是pagination_str的格式,一定要按照控制台的输出格式写,否则验证失败。评论是懒加载的,第一页拿到session_id后,后续请求时都会带着这个session_id。

import requests
import urllib.parse
import csv
import execjs
oid="1906333968"# must be a str
web_location= 1315875# must be an int
session_id=""# the first page is requested without a session_id
# Cookies sent with every comment API request (passed to requests.get as `cookies`).
# NOTE(review): these look like real logged-in session credentials (SESSDATA,
# bili_jct, bili_ticket, ...) hard-coded in source — presumably copied from a
# browser session; consider loading them from the environment and rotating them.
cookies = {
    'buvid3': '6C16A34E-4B78-F350-03AA-71E6B21A703519906infoc',
    'b_nut': '1726211919',
    '_uuid': '828DDCCD-F3CD-3997-11077-1729B6881A6120884infoc',
    'enable_web_push': 'DISABLE',
    'buvid4': '3CB58DB4-B2F0-07C1-06FA-E452949C4A8942274-024082300-j2Owk+KrE1E0oCXj+7DzqA%3D%3D',
    'header_theme_version': 'CLOSE',
    'rpdid': "|(u|kkmlu~ll0J'u~kYkukl|m",
    'fingerprint': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'buvid_fp_plain': 'undefined',
    'buvid_fp': '65fbd3ec7ea1fba4aa76eb96cb7f6249',
    'DedeUserID': '37611353',
    'DedeUserID__ckMd5': 'af2f5320e5c29dea',
    'home_feed_column': '5',
    'browser_resolution': '2048-1023',
    'bili_ticket': 'eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MzY0OTQzNTgsImlhdCI6MTczNjIzNTA5OCwicGx0IjotMX0.UA_DNnfYHwmuWf3mk3zAc45Ar6QrABl70LmFhjli-ms',
    'bili_ticket_expires': '1736494298',
    'SESSDATA': 'dcf4f3e6%2C1751950746%2C36331%2A12CjDsO7miIb_M9f1MQIIa7qIN5AucRW-WAnR_3eKJ6r4sPE3wgHTKNDZEFG6BeHrHqg4SVmd5YlUxTDM0NVdRQ3hHZHhNMFkyS0JQbjhvRWh2U0RTZElXVHFWSy1ZYkZVYzVHSlNhSWV4WUMxV0pRMHB1ZkV6TEFhd1RfaEZqVG90dUJvazNEUVV3IIEC',
    'bili_jct': '91812d98065f2f1035dfb5271f1057b6',
    'CURRENT_FNVAL': '4048',
    
    # TODO
    'sid': '6rzu47nf',# NOTE(review): the original remark here is garbled; it appears to note a digit count, and the entry 'b_lsid': 'EF10A7B92_1944D9191B0' is commented out as a result — confirm whether b_lsid is needed
    'bp_t_offset_37611353': '1020612344109072384',# NOTE(review): original remark was just "digits" — meaning unclear
    
}

# HTTP request headers sent with every comment API request.
headers = {
    'accept': '*/*',
    'accept-language': 'zh-CN,zh;q=0.9',
    'cache-control': 'no-cache',
    # A raw 'cookie' header is not set here; the same values are supplied through the `cookies` dict.
    'origin': 'https://www.bilibili.com',
    'pragma': 'no-cache',
    'priority': 'u=1, i',
    'referer': 'https://www.bilibili.com/video/BV1xU411U7PW/?spm_id_from=333.1391.0.0&vd_source=fd84ddc58aead0485969c92933b61484',
    'sec-ch-ua': '"Google Chrome";v="131", "Chromium";v="131", "Not_A Brand";v="24"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
    'sec-fetch-dest': 'empty',
    'sec-fetch-mode': 'cors',
    'sec-fetch-site': 'same-site',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36',
}


def save2csv(*args):
    """Append one comment record to a CSV file, writing the header first if the file is empty.

    Args:
        *args: At least 12 positional values, in the column order of the
            header row below. ``args[10]`` (all_count) and ``args[11]`` (name)
            also form the output file name ``"{all_count}_{name}.csv"``.

    Raises:
        ValueError: If fewer than 12 values are supplied (index 11 is needed
            for the file name).
    """
    if len(args) < 12:
        raise ValueError("参数错误.")
    with open(f"{args[10]}_{args[11]}.csv", "a", newline='',encoding="utf-8") as f:
        f_csv=csv.writer(f)
        # In append mode the position starts at the end, so 0 means the file is empty.
        if f.tell() == 0:
            headers_csv=['rpid','replay_count','message','like','avatar','sex','uname','oid','parent','is_end','all_count','name']
            f_csv.writerow(headers_csv)
        f_csv.writerow(list(args))
def handle_content(list_comment,is_end,all_count,name):
    """Print every comment in ``list_comment``, nested replies first.

    Args:
        list_comment: List of comment dicts; ``None`` or an empty list is
            treated as "no comments".
        is_end: Last-page flag, passed through to nested calls.
        all_count: Total comment count, passed through to nested calls.
        name: Cursor name, passed through to nested calls.

    Returns:
        None. One space-separated line per comment is printed.
    """
    for comment in list_comment or []:
        rpid=comment["rpid"]  # id of this comment
        replay_count=comment["rcount"]  # number of replies
        message=comment["content"]["message"]  # comment text
        like=comment["like"]  # number of likes
        avatar=comment["member"]["avatar"]  # author's avatar
        sex=comment["member"]["sex"]  # author's sex
        uname=comment["member"]["uname"]  # author's nickname
        oid=comment["oid"]  # id shared by the comment and its replies
        parent=comment["parent"]  # id of the comment being replied to
        # "replies" may be missing or null; nested replies are handled before the parent.
        replies=comment.get("replies")
        if replies:
            handle_content(replies,is_end,all_count,name)
        # To persist rows instead of printing, enable:
        # save2csv(rpid,replay_count,message,like,avatar,sex,uname,oid,parent,is_end,all_count,name)
        print(rpid,replay_count,message,like,avatar,sex,uname,oid,parent)
def handle_cursor(cursor):
    """Pull the paging fields out of a response cursor.

    Args:
        cursor: Mapping with the keys ``is_end`` (last-page flag),
            ``all_count`` (total number of comments) and ``name``.

    Returns:
        The tuple ``(is_end, all_count, name)``.

    Raises:
        KeyError: If any of the three keys is missing.
    """
    wanted_keys = ("is_end", "all_count", "name")
    return tuple(cursor[key] for key in wanted_keys)
def get_params(session_id):
    """Build the signed query parameters for one page of comments.

    Args:
        session_id: Session id returned with an earlier page, or a falsy
            value for the first page.

    Returns:
        dict: The request parameters including the ``w_rid`` and ``wts``
        fields produced by the ``lt`` function in ``./bili/comment_url.js``.
    """
    import json  # local import: only this function needs it

    # Target format for later pages:
    # {"offset":"{\"type\":1,\"direction\":1,\"session_id\":\"1778143604964054\",\"data\":{}}"}
    # The offset is itself a compact JSON document embedded as a string;
    # the first page uses an empty offset.
    if session_id:
        offset = json.dumps(
            {"type": 1, "direction": 1, "session_id": str(session_id), "data": {}},
            separators=(",", ":"),
            ensure_ascii=False,
        )
    else:
        offset = ""
    pagination_str = json.dumps({"offset": offset}, separators=(",", ":"), ensure_ascii=False)
    params={
    "oid": oid,
    "type": 1,
    "mode": 3,
    "pagination_str": pagination_str,
    "plat": 1,
    'seek_rpid': '',
    "web_location": web_location
    }
    # Read the script inside a context manager so the file handle is closed.
    with open('./bili/comment_url.js','r',encoding='utf-8') as js_file:
        js_source = js_file.read()
    ctx=execjs.compile(js_source).call('lt',params)
    params.update({
        'w_rid': ctx["w_rid"],
        'wts': ctx["wts"]
    })
    return params
if __name__=="__main__":
    # Crawl every page of comments, carrying the session id from each
    # response into the next request.
    import json  # local import: only this driver parses the pagination cursor

    count=1
    while True:
        params=get_params(session_id)
        print(params)
        response = requests.get(
            'https://api.bilibili.com/x/v2/reply/wbi/main',
            cookies=cookies,
            headers=headers,
            params=params,
            timeout=10,  # do not hang forever on a stalled connection
        )
        response.raise_for_status()
        # print(response.text)
        payload = response.json()  # parse the body once
        data = payload.get("data") or {}
        cursor = data.get("cursor")
        if not cursor:
            raise RuntimeError(f"unexpected response without data.cursor: {payload}")
        is_end,all_count,name=handle_cursor(cursor)
        replies = data.get("replies") or []
        handle_content(replies,is_end,all_count,name)
        print(f"第{count}页爬完了")
        count+=1
        if is_end==True:
            print(f"爬取完成,一共有{all_count}条")
            break
        # NOTE(review): assumes the next session id is in
        # cursor["pagination_reply"]["next_offset"] (a JSON string) or, failing
        # that, cursor["session_id"] — verify against a live response.
        next_session_id = ""
        next_offset = (cursor.get("pagination_reply") or {}).get("next_offset")
        if next_offset:
            try:
                next_session_id = json.loads(next_offset).get("session_id", "")
            except (ValueError, AttributeError):
                next_session_id = ""
        if not next_session_id:
            next_session_id = cursor.get("session_id", "")
        if next_session_id:
            session_id = next_session_id
        # Without a session id the next request would fetch page one again,
        # and an empty page means there is nothing left to read.
        if not session_id or not replies:
            print("no further page available; stopping")
            break

2.单视频弹幕


http://www.kler.cn/a/488687.html

相关文章:

  • CI/CD 流水线
  • 【Linux】模拟Shell命令行解释器
  • Jenkins-持续集成、交付、构建、部署、测试
  • [笔记] 使用 Jenkins 实现 CI/CD :从 GitLab 拉取 Java 项目并部署至 Windows Server
  • 点击底部的 tabBar 属于 wx.switchTab 跳转方式,目标页面的 onLoad 不会触发(除非是第一次加载)
  • 详细全面讲解C++中重载、隐藏、覆盖的区别
  • 智元机器人完成 1000 台通用具身机器人下线
  • 计算机毕业设计Python机器学习农作物健康识别系统 人工智能 图像识别 机器学习 大数据毕业设计 算法
  • Linux Snort检测
  • 工商银行devops流程一体化工具
  • uniapp结合movable-area与movable-view实现拖拽功能2
  • Hbuilder ios 离线打包sdk版本4.36,HbuilderX 4.36生成打包资源 问题记录
  • wireshark排除私接小路由
  • MT6835天玑6100平台规格参数_MTK联发科安卓核心板方案定制开发
  • 【MFC】设置CTreeCtrl单个节点的文字颜色
  • Jenkins git SSH获取code报错:git@github.com: Permission denied (publickey).
  • 计算机网络 (33)传输控制协议TCP概述
  • 【HTML+CSS+JS+VUE】web前端教程-18-css引入方式
  • 2025年第三届“华数杯”国际赛B题解题思路与代码(Matlab版)
  • 网络安全测评技术与标准
  • LeetCode:2274. 不含特殊楼层的最大连续楼层数(排序 Java)
  • smplx blender插件笔记
  • 甘蔗叶片图像元素含量的回归预测多模型实现【含私人数据集】
  • Windows 程序设计2:第一个Windows实例程序
  • 什么是数据湖?大数据架构的未来趋势
  • C++ 中的 template <typename T> 用法 ← 泛型