Python爬虫案例四:爬取某个博主的所有文章并保存成PDF格式
引入(将图片保存成PDF格式):
# Test link: https://zq.bookan.com.cn/?t=detail&id=21088&ct=1&is=31042341&rid=4658
# (library page images saved as a PDF).
# Prerequisite: install the library first with `pip install img2pdf`.
# Steps:
import img2pdf
import requests

url_list = [
    'http://img1-qn.bookan.com.cn/page8/3234/3234-310411164/4214b5ac_big.mg',
    'http://img1-qn.bookan.com.cn/page8/8314/8314-310441286/8522073f_big.mg',
]

# Download the raw bytes of every page image. A timeout keeps the script from
# hanging on a dead connection, and raise_for_status() stops an HTTP error
# body from being handed to img2pdf as if it were image data.
data_list = []
for url in url_list:
    image_response = requests.get(url, timeout=10)
    image_response.raise_for_status()
    data_list.append(image_response.content)

# 1. Prepare the page width and height (millimetres converted to points).
width = img2pdf.mm_to_pt(300)
height = img2pdf.mm_to_pt(300)

# 2. Prepare the blank PDF page: this is only the layout, the PDF is still
#    empty at this point.
pdf_size = img2pdf.get_layout_fun((width, height))

# 3. Add the data (image bytes + the PDF page layout).
pdf_data = img2pdf.convert(data_list, layout_fun=pdf_size)

# 4. Save.
with open('测试.pdf', 'wb') as f:
    f.write(pdf_data)
print('ok')

# Test result:
案例实战: 抓取CSDN某位博主的文章并将其保存成PDF格式(先抓取单篇文章,再批量抓取)
源码:
# Crawler part ====> reusable code
import os
import re

import parsel
import pdfkit
import requests
from lxml import etree

# Blog home page whose article list is crawled by get_all().
INFO_URL = 'https://blog.csdn.net/2301_80014606'

# Seconds to wait for each HTTP request before giving up.
REQUEST_TIMEOUT = 15

# Intermediate HTML file written by get_one() and read back by To_pdf().
HTML_FILE = 'csdn正文.html'

# Location of the wkhtmltopdf executable. The original hard-coded Windows path
# stays the default; set the WKHTMLTOPDF_PATH environment variable to override.
WKHTMLTOPDF_PATH = os.environ.get(
    'WKHTMLTOPDF_PATH', r'E:\wkhtmltopdf\bin\wkhtmltopdf.exe'
)

# Characters that are not allowed in Windows file names, plus line breaks/tabs.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]')

# Browser-like request headers shared by the list-page and article requests.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Cookies that are identical for both requests.
# NOTE(review): these include account values such as UserToken; they look like
# a captured browser session and presumably expire -- consider loading them
# from the environment instead of keeping them in source.
_COMMON_COOKIES = {
    'uuid_tt_dd': '10_10174532770-1711812940092-685570',
    'Hm_up_6bcd52f51e9b3dce32bec4a3997715ac': '%7B%22islogin%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isonline%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%2C%22isvip%22%3A%7B%22value%22%3A%220%22%2C%22scope%22%3A1%7D%7D',
    'cf_clearance': 'LEjKw1U8FYwFrjTgFMgFQzHVdqSloAwUhWOl5yCq_Og-1716465784-1.0.1.1-G0ycyrRoC9DCMQtiQoceRTB4oeUXPKSRASfv7PAxGkqfQk8p5RJVwkCd5NiS7rXLxchn0FxjrNqhStHM_OtkWw',
    'UserName': 'm0_74614835',
    'UserInfo': '8268477d59434a4194615c67cd9ea26d',
    'UserToken': '8268477d59434a4194615c67cd9ea26d',
    'UserNick': 'm0_74614835',
    'AU': '23E',
    'UN': 'm0_74614835',
    'BT': '1718679492185',
    'p_uid': 'U010000',
    'm0_74614835comment_new': '1720492338243',
    'firstDie': '1',
    'c_segment': '2',
    'HMACCOUNT': '66A8254591DC78E3',
    'Hm_lvt_ec8a58cd84a81850bcbd95ef89524721': '1720707980,1721100768,1721197835,1721446756',
    'c_dl_fref': 'https://www.iteye.com/',
    'c_utm_source': 'iteye',
    'c_dl_prid': '1721446933332_253428',
    'c_dl_rid': '1721446989577_111570',
    'c_dl_fpage': '/download/Programmer_FuQiang/12187394',
    'c_dl_um': 'distribute.pc_relevant_download.none-task-download-2%7Edefault%7Ekeyword%7ERate-6-12187394-download-18698164.257%5Ev16%5Epc_dl_relevant_base1_a',
    'creativeSetApiNew': '%7B%22toolbarImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20231011044944.png%22%2C%22publishSuccessImg%22%3A%22https%3A//img-home.csdnimg.cn/images/20240229024608.png%22%2C%22articleNum%22%3A0%2C%22type%22%3A0%2C%22oldUser%22%3Afalse%2C%22useSeven%22%3Atrue%2C%22oldFullVersion%22%3Afalse%2C%22userName%22%3A%22m0_74614835%22%7D',
    'c_page_id': 'default',
}

# Cookies sent with the article-list request (common values + list-specific).
LIST_PAGE_COOKIES = {
    **_COMMON_COOKIES,
    'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721100769,1721197835,1721446756,1721521897',
    'https_waf_cookie': '50bed8ef-937c-4dd73e2026e8fe2a1adf6adc04dac5006194',
    'dc_sid': '167e0952bfbaec891be59557bcfa7617',
    '_clck': '16zlski%7C2%7Cfnn%7C0%7C1550',
    'csrfToken': 'SmchSM7MO-KHW5qJeFuboz29',
    '__gads': 'ID=32681eea0b3f2dc4:T=1711812942:RT=1721525715:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w',
    '__gpi': 'UID=00000d7858e98a50:T=1711812942:RT=1721525715:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg',
    '__eoi': 'ID=854a41bc034210ac:T=1711812942:RT=1721525715:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q',
    'c_first_ref': 'default',
    'c_utm_medium': 'distribute.pc_feed_blog_category.none-task-blog-classify_tag-3-139705227-null-null.nonecase',
    'dc_session_id': '10_1721527944582.357468',
    'c_pref': 'default',
    'log_Id_click': '144',
    'waf_captcha_marker': '2ad0ccbd0b616ff0b7f0ead6dcb53d42a740df830a745a7366d633b88332b078',
    'c_ref': 'default',
    'c_first_page': 'https%3A//blog.csdn.net/2301_80014606%3Ftype%3Dblog',
    'c_dsid': '11_1721528107847.945530',
    'log_Id_pv': '111',
    'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721528111',
    'log_Id_view': '3535',
    '_clsk': '112h5bx%7C1721528118664%7C1%7C0%7Cy.clarity.ms%2Fcollect',
    'dc_tos': 'sgybm1',
}

# Cookies sent with each single-article request (common + article-specific).
ARTICLE_COOKIES = {
    **_COMMON_COOKIES,
    'Hm_lvt_6bcd52f51e9b3dce32bec4a3997715ac': '1720707980,1721100769,1721197835,1721446756',
    'Hm_lpvt_ec8a58cd84a81850bcbd95ef89524721': '1721446756',
    'dc_sid': '37a800e065792e9d25758f94942b724b',
    '_clck': '16zlski%7C2%7Cfnm%7C0%7C1550',
    'dc_session_id': '10_1721489180573.741111',
    'c_first_ref': 'www.baidu.com',
    'c_first_page': 'https%3A//www.csdn.net/',
    'c_dsid': '11_1721489179823.492641',
    'https_waf_cookie': 'b0c0b344-8c52-498bc5a67d7393bbce036bff877d4003edec',
    'log_Id_click': '131',
    'c_pref': 'https%3A//www.csdn.net/',
    'c_ref': 'https%3A//i.csdn.net/',
    'log_Id_pv': '93',
    'Hm_lpvt_6bcd52f51e9b3dce32bec4a3997715ac': '1721489207',
    '__gads': 'ID=32681eea0b3f2dc4:T=1711812942:RT=1721489211:S=ALNI_MZOcGfXX7k9EPNVXFUB9vOn-7JQ4w',
    '__gpi': 'UID=00000d7858e98a50:T=1711812942:RT=1721489211:S=ALNI_MZrVxnKqRLfUnUvHdPxcMdkGTiYUg',
    '__eoi': 'ID=854a41bc034210ac:T=1711812942:RT=1721489211:S=AA-AfjbYkMhl8MU1tq93LJL52Q-q',
    '_clsk': '14eo8p7%7C1721489210658%7C1%7C0%7Cx.clarity.ms%2Fcollect',
    'log_Id_view': '2833',
    'waf_captcha_marker': '9c2b63af339e16b26460039c73324b937b80bb04c7836dd3ef90ded0e605b8fe',
    'dc_tos': 'sgxhl0',
}

# Minimal HTML page that wraps the extracted article body; `{}` is the body.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
{}
</body>
</html>
'''


def _safe_filename(title):
    """Return *title* with characters that are invalid in file names replaced."""
    cleaned = _INVALID_FILENAME_CHARS.sub('_', title).strip().rstrip('.')
    return cleaned or 'untitled'


def get_all():
    """Download every article linked from the blogger's list page.

    Fetches INFO_URL, collects the article links found under
    ``article.blog-list-box`` and passes each one to get_one(). A failure on
    one article is reported and the batch continues with the next link.

    Raises:
        requests.RequestException: if the list page itself cannot be fetched.
    """
    response = requests.get(
        INFO_URL,
        params={'type': 'blog'},
        cookies=LIST_PAGE_COOKIES,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()

    tree = etree.HTML(response.text)
    if tree is None:
        url_list = []
    else:
        url_list = tree.xpath('//article[@class="blog-list-box"]/a/@href')

    for url in url_list:
        try:
            get_one(url)
        except (requests.RequestException, OSError) as err:
            # Report and move on so one bad article does not abort the batch.
            print('保存失败--{}--{}'.format(url, err))


def get_one(url):
    """Download a single article and convert it to a PDF.

    The title is taken from the first ``<h1>`` (via XPath) and the body from
    the ``#content_views`` element (via CSS). The body is wrapped in
    HTML_TEMPLATE, written to HTML_FILE, and handed to To_pdf().

    Args:
        url: Address of the article page.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    response = requests.get(
        url,
        cookies=ARTICLE_COOKIES,
        headers=HEADERS,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    page = response.text

    # ------ title via XPath: the heading is an <h1>, not always <title> ------
    tree = etree.HTML(page)
    titles = tree.xpath('//h1/text()') if tree is not None else []

    # ------ article body via CSS selector ------
    data = parsel.Selector(page).css('#content_views').get()

    # Without a title or a body there is nothing meaningful to convert; the
    # original code would crash on [0] or write the text "None" to the file.
    if not titles or data is None:
        print('跳过(未找到标题或正文)--{}'.format(url))
        return

    title = titles[0].strip()
    with open(HTML_FILE, 'w', encoding='utf-8') as f:
        f.write(HTML_TEMPLATE.format(data))
    To_pdf(title)


def To_pdf(title):
    """Convert the saved HTML_FILE into ``<title>.pdf`` using wkhtmltopdf.

    Args:
        title: Article title; used (after sanitising) as the PDF file name.
    """
    kit = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
    # Titles may contain characters such as '/' or ':' that cannot be used
    # in a file name, so sanitise before building the output path.
    output_name = '{}.pdf'.format(_safe_filename(title))
    pdfkit.from_file(HTML_FILE, output_name, configuration=kit)
    print('保存OK--{}'.format(title))


def main():
    """Script entry point: download all articles of the configured blogger."""
    get_all()


if __name__ == '__main__':
    main()
运行效果(只列举其中一篇文章):