3.4 爬虫实战-爬取智联招聘职位信息
课程目标
爬取智联招聘职位信息
课程内容
import requests # 导入requests库,用于发送网络请求
from bs4 import BeautifulSoup # 导入BeautifulSoup库,用于解析HTML文档
from tqdm import tqdm # 导入tqdm库,用于显示进度条
import pandas as pd # 导入pandas库,用于数据处理和导出Excel文件
def tran_salary(ori_salary):
    """Convert a scraped salary string to a numeric value in yuan.

    Args:
        ori_salary: salary text from the page, e.g. "1.5万", "8千", "5000".

    Returns:
        float: "万" multiplies by 10,000, "千" by 1,000; a plain numeric
        string is converted as-is.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    if "万" in ori_salary:
        return float(ori_salary.replace("万", "")) * 10000
    if "千" in ori_salary:
        return float(ori_salary.replace("千", "")) * 1000
    # Bug fix: strings with neither unit used to be returned unchanged as
    # str, which broke the caller's numeric averaging (str + str, then /2).
    return float(ori_salary)
# Request headers copied from browser DevTools so the request mimics a real
# browser and is less likely to be rejected as a bot.
headers = {
"authority": "www.zhaopin.com",
"cache-control": "max-age=0",
"sec-ch-ua": "\"Chromium\";v=\"92\", \" Not A;Brand\";v=\"99\", \"Microsoft Edge\";v=\"92\"",
"sec-ch-ua-mobile": "?0",
"upgrade-insecure-requests": "1",
"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.131 Safari/537.36 Edg/92.0.902.67",
"accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
"sec-fetch-site": "same-origin",
"sec-fetch-mode": "navigate",
"sec-fetch-user": "?1",
"sec-fetch-dest": "document",
"referer": "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1",
"accept-language": "zh-CN,zh;q=0.9"
}
# Cookies used to keep the session alive.
cookies = {
# The concrete cookie values are omitted; they are normally captured from
# the browser after logging in.
}
# Target URL: first page of Zhaopin search results for the chosen
# city/keyword combination (encoded in the path).
url = "https://www.zhaopin.com/sou/jl801/kw01O00U80EG06G03F01N0/p1"
# Fetch the page. Fix: pass a timeout — without one, a stalled connection
# would hang the script indefinitely.
response = requests.get(url, headers=headers, cookies=cookies, timeout=15)
# Fix: fail fast on HTTP errors instead of silently parsing an error page.
response.raise_for_status()
# Parse the returned HTML document.
html_str = response.text
soup = BeautifulSoup(html_str, "html.parser")
# Each job posting is rendered inside a <div class="joblist-box__item">.
joblist = soup.find_all("div", class_="joblist-box__item")
# Accumulator for one dict per scraped job posting.
infos = []
# Walk every job card and extract its fields into a dict.
for job_item in tqdm(joblist):  # tqdm renders a progress bar
    # Job title and company name.
    jobinfo__name = job_item.find("a", class_="jobinfo__name").text.strip()
    company_name = job_item.find("a", class_="companyinfo__name").text.strip()

    # Salary: "面议" (negotiable) maps to 0; a range like "1万-2万·13薪"
    # has its "·13薪" bonus suffix stripped, then the two bounds averaged.
    jobinfo__salary = job_item.find("p", class_="jobinfo__salary").text.strip()
    if jobinfo__salary == '面议':
        salary = 0
    else:
        if "·" in jobinfo__salary:
            jobinfo__salary = jobinfo__salary.split("·")[0]
        min_salary, max_salary = jobinfo__salary.split("-")
        salary = (tran_salary(min_salary) + tran_salary(max_salary)) / 2

    # Skill tags (the tag container may be absent on some cards).
    skills = []
    jobinfo__tag = job_item.find("div", class_="jobinfo__tag")
    if jobinfo__tag is not None:
        for tag_div in jobinfo__tag.findAll("div"):
            skills.append(tag_div.text)

    # Location text looks like "city·district·area"; missing parts stay "".
    jobinfo__other_info = job_item.find("div", class_="jobinfo__other-info")
    jobinfo__other_infos = jobinfo__other_info.find_all("div")
    area_strs = jobinfo__other_infos[0].text.strip().split("·")
    region, classify, city = "", "", ""
    if len(area_strs) > 2:
        region = area_strs[2]
    if len(area_strs) > 1:
        classify = area_strs[1]
    if len(area_strs) > 0:
        city = area_strs[0]

    # Experience requirement: "经验不限" (no limit) becomes 0; a range like
    # "1-3年" keeps its lower bound.
    experience_requirement = jobinfo__other_infos[1].text.strip()
    if experience_requirement == "经验不限":
        experience_requirement = "0"
    experience_requirement = experience_requirement.replace("年", "")
    if "-" in experience_requirement:
        experience_requirement = experience_requirement.split("-")[0]
    experience_requirement = int(experience_requirement)
    education_background_requirement = jobinfo__other_infos[2].text.strip()

    # Company info: financing stage, scale, and (optional) company type.
    # Fix: guard against a missing companyinfo__tag container, mirroring the
    # None-check already applied to jobinfo__tag, and bounds-check each item
    # instead of indexing blindly.
    finance_info, scale, company_type = "", "", ""
    companyinfo__tag = job_item.find("div", class_="companyinfo__tag")
    if companyinfo__tag is not None:
        company_info_items = companyinfo__tag.findAll("div")
        if len(company_info_items) > 0:
            finance_info = company_info_items[0].text.strip()
        if len(company_info_items) > 1:
            scale = company_info_items[1].text.strip()
        if len(company_info_items) > 2:
            company_type = company_info_items[2].text.strip()

    # Fix: the job title was scraped but never stored; include it in the
    # record alongside the other fields.
    infos.append({
        "职位名称": jobinfo__name,
        "公司名字": company_name,
        "薪资": salary,
        "技能要求": skills,
        "市": city,
        "区": classify,
        "区域": region,
        "经验要求": experience_requirement,
        "学历要求": education_background_requirement,
        "融资信息": finance_info,
        "规模": scale,
        "公司类型": company_type
    })
# Build a tabular DataFrame from the list of per-job record dicts.
df = pd.DataFrame.from_records(infos)
# Write the table out as an Excel workbook, dropping the row index.
df.to_excel("智联职位信息.xlsx", index=False)