Using a crawler to download files from a web directory listing
A crawler that downloads the PDF files linked from a web page
import os

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, unquote
from tqdm import tqdm

# URL of the directory-listing page to crawl
base_url = "http://119/download/dzz/pdf/"

# Folder for the downloaded files; create it if it does not already exist
download_folder = "downloaded_pdfs"
os.makedirs(download_folder, exist_ok=True)
def get_name(base_url):
    """Return the href of every <a> tag on a directory-listing page."""
    name_list = []
    # Fetch the page content
    response = requests.get(base_url)   # send the request for the page
    response.raise_for_status()         # raise an exception if the request failed
    soup = BeautifulSoup(response.text, "html.parser")  # parse the HTML
    # Collect every link target on the page
    for link in soup.find_all("a"):     # iterate over all <a> tags
        name_list.append(link.get("href"))
    return name_list
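
# Note: get_name() returns every href on the listing page, including the
# parent-directory link "../", so the loops below filter that out themselves.
# For a hypothetical listing the return value might look like
# ["../", "2021%20reports/", "manual.pdf"] (illustrative values only).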
# Level 1: entries in the root directory listing
f1_list = get_name(base_url)
for i1 in f1_list:
    if i1 and i1 not in ["../"]:        # skip empty hrefs and the parent-directory link
        print(i1)
        f1_url = urljoin(base_url, i1)  # turn the relative link into an absolute URL
        # Level 2: entries inside each first-level sub-directory
        f2_list = get_name(f1_url)
        for i2 in f2_list:
            if i2 and i2 not in ["../"]:
                f2_url = urljoin(f1_url, i2)    # absolute URL of the second-level directory
                # Level 3: the files themselves
                f3_list = get_name(f2_url)
                for i3 in tqdm(f3_list):
                    if i3 and i3 not in ["../"]:
                        f3_url = urljoin(f2_url, i3)            # absolute URL of the file
                        file_response = requests.get(f3_url)    # download the file (once)
                        file_response.raise_for_status()        # abort on HTTP errors
                        # Mirror the remote directory structure locally
                        # (the directory hrefs from the listing already end with "/")
                        download_folder_new = unquote(download_folder + '/' + i1 + i2)
                        os.makedirs(download_folder_new, exist_ok=True)
                        download_folder_path = download_folder_new + unquote(i3)
                        with open(download_folder_path, "wb") as f:     # write in binary mode
                            f.write(file_response.content)              # save the downloaded bytes
                        # print(f"{download_folder_path} downloaded")   # optional progress message
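
For larger directories, the same traversal can be made more robust by streaming each file to disk instead of holding it in memory, adding a request timeout, and skipping anything that is not a PDF. The sketch below is one possible variant that reuses the imports, base_url, download_folder and get_name() from the script above; the download_pdf() helper name, the timeout value, the chunk size and the ".pdf" filter are assumptions, not part of the original script.

def download_pdf(file_url, target_dir, timeout=30):
    """Stream one remote file into target_dir and return the local path (sketch)."""
    os.makedirs(target_dir, exist_ok=True)
    # Take the last URL segment as the file name and decode percent-escapes
    local_path = os.path.join(target_dir, unquote(file_url.rsplit("/", 1)[-1]))
    with requests.get(file_url, stream=True, timeout=timeout) as r:  # assumed timeout
        r.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):  # write in 8 KB chunks
                f.write(chunk)
    return local_path

for i1 in get_name(base_url):
    if not i1 or i1 == "../":
        continue
    f1_url = urljoin(base_url, i1)
    for i2 in get_name(f1_url):
        if not i2 or i2 == "../":
            continue
        f2_url = urljoin(f1_url, i2)
        for i3 in tqdm(get_name(f2_url)):
            # Only fetch PDF files (assumption: everything else can be skipped)
            if i3 and i3.lower().endswith(".pdf"):
                download_pdf(urljoin(f2_url, i3),
                             os.path.join(download_folder, unquote(i1 + i2)))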