News Crawler

The script below fetches a channel list page from chinanews.com (the other channel URLs are kept in `urls`), follows each article link, extracts the title, publish time, source, and body, and stores each record in a MySQL `news` table.
import random

import pymysql
import requests
from bs4 import BeautifulSoup
# Collected (category, relative-link) pairs gathered from the list page.
links = []

# Browser-like User-Agent so the site serves the pages normally.
hea = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2272.118 Safari/537.36'
}
urls = [
    "https://www.chinanews.com/china.shtml",
    "https://www.chinanews.com/society.shtml",
    "https://www.chinanews.com/compatriot.shtml",
    "https://www.chinanews.com/wenhua.shtml",
    "https://www.chinanews.com/world.shtml",
    "https://www.chinanews.com/cj/gd.shtml",
    "https://www.chinanews.com/sports.shtml",
    "https://www.chinanews.com/huaren.shtml"
]
# MySQL connection. 3306 is MySQL's default port (the original 3396 was
# almost certainly a typo); utf8mb4 keeps the Chinese text intact.
db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                     port=3306, db='news_recommendation_system', charset='utf8mb4')
cursor = db.cursor()
def main():
    # Crawl one channel's list page, then fetch and store every article.
    # The other channels in `urls` can be crawled the same way.
    baseurl = 'https://www.chinanews.com/taiwan.shtml'
    getLink(baseurl)
    getInformationAndSave()
    db.close()
def getInformationAndSave():
    # Visit every collected article link, parse the page, and save one record.
    for link in links:
        url = "https://www.chinanews.com" + link[1]
        cur_html = requests.get(url, headers=hea, timeout=10)
        cur_html.encoding = "utf8"
        soup = BeautifulSoup(cur_html.text, 'html.parser')
        # Article title.
        title = soup.find('h1').text.strip()
        # The "left-t" line holds the publish date, time, and source.
        tr = soup.find('div', class_='left-t').text.split()
        time = tr[0] + tr[1]
        recourse = tr[2]
        # Article body.
        content = soup.find('div', class_="left_zw").text.strip()
        print(link[0] + "---" + title + "---" + time + "---" + recourse + "---" + url)
        saveDate(title, content, time, recourse, url)
def deleteDate():
    # Clear the news table before a fresh crawl (not called by default).
    sql = "DELETE FROM news"
    try:
        cursor.execute(sql)
        db.commit()
    except Exception:
        db.rollback()
def saveDate(title, content, time, recourse, url):
    # Parameterized INSERT: letting pymysql do the quoting avoids the SQL
    # errors the original string interpolation caused whenever the article
    # text contained quote characters.
    try:
        cursor.execute(
            "INSERT INTO news(news_title, news_content, type_id, news_creatTime, news_recourse, news_link) "
            "VALUES (%s, %s, %s, %s, %s, %s)",
            (title, content, random.randint(1, 8), time, recourse, url))
        db.commit()
        print("insert succeeded")
    except Exception:
        db.rollback()
        print("insert failed")
def getLink(baseurl):
    # Fetch a channel list page and collect (category, article-link) pairs.
    html = requests.get(baseurl, headers=hea, timeout=10)
    html.encoding = 'utf8'
    soup = BeautifulSoup(html.text, 'html.parser')
    for item in soup.select('div.content_list > ul > li'):
        # Date-separator rows have no <a>; skip them.
        if item.a is None:
            continue
        # The first <div> holds the bracketed category label, e.g. "[台湾]".
        news_type = item.div.text[1:3]
        # The sibling <div> holds the article link.
        link = item.div.next_sibling.next_sibling.a['href']
        links.append([news_type, link])
if __name__ == '__main__':
    main()
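
For reference, the INSERT in saveDate assumes a `news` table with the six columns used above. The original project's schema is not shown here, so the following one-off setup helper is only a sketch of a compatible layout; the column types and the `news_id` primary key are assumptions.

import pymysql

def create_news_table():
    # One-off helper: create a `news` table compatible with saveDate above.
    # All column types here are assumptions, not the project's real schema.
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           port=3306, db='news_recommendation_system',
                           charset='utf8mb4')
    with conn.cursor() as cur:
        cur.execute("""
            CREATE TABLE IF NOT EXISTS news (
                news_id        INT AUTO_INCREMENT PRIMARY KEY,
                news_title     VARCHAR(255),
                news_content   TEXT,
                type_id        INT,
                news_creatTime VARCHAR(32),
                news_recourse  VARCHAR(64),
                news_link      VARCHAR(255)
            ) CHARACTER SET utf8mb4
        """)
    conn.commit()
    conn.close()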