Python爬虫案例七:抓取南京公交信息数据并将其保存成excel多表形式
测试链接:
https://nanjing.8684.cn/line4
思路:先抓取某个类型下的某一条线路所有数据,然后实现批量,列举出三个类型代表即可
源码:
# -*- coding: utf-8 -*-
"""Scrape Nanjing bus-line data from nanjing.8684.cn and store it in one
Excel workbook with a separate sheet per line category.

Flow: get_all -> parse_all -> get_one -> parse_one -> save_data.
"""
import os

import requests
import xlrd
import xlwt
from lxml import etree
from xlutils.copy import copy

# Output workbook; created on first save, then appended to sheet by sheet.
XLS_PATH = r'./公交线路数据.xls'

# Column titles written to row 0 of every sheet.
SHEET_HEADER = ('线路名称', '线路类型', '运行时间', '参考票价',
                '公交公司', '最后更新', '公交路线-正向', '公交路线-反向')

# Browser-like request headers; identical for the list page and detail page.
HEADERS = {
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Cache-Control": "no-cache",
    "Connection": "keep-alive",
    "Pragma": "no-cache",
    "Sec-Fetch-Dest": "document",
    "Sec-Fetch-Mode": "navigate",
    "Sec-Fetch-Site": "none",
    "Sec-Fetch-User": "?1",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36",
    "sec-ch-ua-mobile": "?0",
}

# NOTE(review): the cookies below are session-bound tokens captured from a
# browser and will expire; refresh them if the requests start failing.
COOKIES_LIST = {
    "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
    "wbf__voiceplg-is": "false",
    "tongue": "1",
    "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
    "__gads": "ID",
    "__gpi": "UID",
    "__eoi": "ID",
    "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PasK97Oo+bbbcKQgQu9uxadc%3D",
    "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuIneKymhvMCOXLg1JoYhcHTYNyZi_ZD1PkQ8wHX0_ycxbyF1QTuQWF68O-J3hMNYeSVrLdplIVuNxTyW1OaKt18bXNTDHrBSmsZ7DEMwNaY3o1qfZ-Gy932UGgUlRkYaQLMujMyT2eGMlHUKElpXgb3WIdgV2i4dGkFfMutvbhUKyxkjaWZMOhimPI5uGe2Zow",
    "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712671763",
}

COOKIES_LINE = {
    "JSESSIONID": "48304F9E8D55A9F2F8ACC14B7EC5A02D",
    "wbf__voiceplg-is": "false",
    "tongue": "1",
    "Hm_lvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712659199",
    "__gads": "ID",
    "__gpi": "UID",
    "__eoi": "ID",
    "Hm_lpvt_c31f95cd1f1c01c74f44d211939ceb8c": "1712667896",
    "SECKEY_ABVK": "2DPSFBW+PxohRgE9br/PahPpT7wKZzGpOzUoWKrHE14%3D",
    "BMAP_SECKEY": "XCSGTS0HVG9MJBd7qjmcuNCVpgwunmx3HOykd-nz4D-iFhbshz31f4mcmp3_W2DuaWoxnWstpA8--nKAgM_oHpmeq9I4YTbb3qlNksDhm1p8aAgMLY_JmRsPghK_5Cz-OHHnXHh16-fsX6GY9TW5yRhSOnFDrBnVc4V5LysnCzkEjrJ4OArZaTA6rA9Gid8tLBOeKUHh-nAGPdfN_KgAnw",
}


def get_all():
    """Fetch the line-list page for each of the three line categories."""
    tynm_list = ['江南线路(1-399)', '江北线路(400-699)', '江宁线路(700-999)']
    tyid_list = [2, 3, 4]
    for tynm, tyid in zip(tynm_list, tyid_list):
        list_url = 'https://nanjing.8684.cn/line{}'.format(tyid)
        response = requests.get(
            list_url, headers=HEADERS, cookies=COOKIES_LIST
        ).content.decode()
        parse_all(response, tynm)


def parse_all(response, tynm):
    """Extract every line's relative URL from a category list page and
    fetch each line in turn."""
    A = etree.HTML(response)
    for a in A.xpath('//div[@class="list clearfix"]/a'):
        # href is the relative path identifying one bus line.
        xlid = a.xpath('./@href')[0]
        get_one(xlid, tynm)


def get_one(xlid, tynm):
    """Fetch the detail page of a single bus line."""
    one_url = 'https://nanjing.8684.cn{}'.format(xlid)
    response = requests.get(
        one_url, headers=HEADERS, cookies=COOKIES_LINE
    ).content.decode()
    parse_one(response, tynm)


def parse_one(response, tynm):
    """Parse one line's detail page into a row and hand it to save_data."""
    A = etree.HTML(response)
    # Line name
    xlmc = ''.join(A.xpath('//h1[@class="title"]/span/text()'))
    # Line type (strip the surrounding brackets)
    xllx = ''.join(A.xpath('//h1[@class="title"]/a/text()'))[1:-1]
    # Operating hours (text after the full-width colon label)
    yxsj = ''.join(A.xpath('//ul[@class="bus-desc"]/li[1]/text()')).split('时间:')[-1]
    # Ticket price
    ckpj = ''.join(A.xpath('//ul[@class="bus-desc"]/li[2]/text()')).split('票价:')[-1]
    # Operating company
    gjgs = ''.join(A.xpath('//ul[@class="bus-desc"]/li[3]/a/text()'))
    # Last updated
    zhgx = ''.join(A.xpath('//ul[@class="bus-desc"]/li[4]/span/text()')).split('更新:')[-1]
    # Stop list, forward direction
    gjxl_zx = '/'.join(A.xpath(
        '//div[@class="service-area"]/div[@class="bus-lzlist mb15"][1]/ol/li/a/text()'))
    # Stop list, reverse direction (empty when the line has no second list)
    gjxl_fx = '/'.join(A.xpath(
        '//div[@class="service-area"]/div[@class="bus-lzlist mb15"][2]/ol/li/a/text()'))
    data = {tynm: [xlmc, xllx, yxsj, ckpj, gjgs, zhgx, gjxl_zx, gjxl_fx]}
    save_data(data, tynm, xlmc)


def _write_header_row(sheet):
    """Write the shared column titles into row 0 and widen the columns."""
    for col, title in enumerate(SHEET_HEADER):
        sheet.col(col).width = 2560 * 3
        sheet.write(0, col, title)


def save_data(data, tynm, xlmc):
    """Append one bus line's row to the sheet named after its category.

    data: {sheet_name: [row values]}; tynm is the category/sheet name;
    xlmc is the line name (used only in the progress message).
    """
    # Create the workbook with the first category sheet on first use.
    if not os.path.exists(XLS_PATH):
        wb = xlwt.Workbook(encoding='utf-8')
        _write_header_row(wb.add_sheet(tynm, cell_overwrite_ok=True))
        wb.save(XLS_PATH)

    rb = xlrd.open_workbook(XLS_PATH)
    # Add the category sheet when the first line of a new category arrives.
    if tynm not in rb.sheet_names():
        wb = copy(rb)
        _write_header_row(wb.add_sheet(tynm))
        wb.save(XLS_PATH)
        rb = xlrd.open_workbook(XLS_PATH)

    # xlwt cannot modify an existing file: copy via xlutils, then append
    # the row just below the target sheet's last used row.
    idx = rb.sheet_names().index(tynm)
    rows_old = rb.sheet_by_index(idx).nrows
    wb = copy(rb)
    ws = wb.get_sheet(idx)
    for col, value in enumerate(data[tynm]):
        ws.write(rows_old, col, value)
    wb.save(XLS_PATH)
    print(r'***ok: 公交线路数据: {} - {}'.format(tynm, xlmc))


if __name__ == '__main__':
    get_all()  # fixed: original source ended with a stray 'y' (SyntaxError)
运行效果:
1)running中:
2) ending: