当前位置：首页 > article >正文

Python 爬虫中的解析方法

article 2025/2/21 5:54:44

1. 使用 BeautifulSoup 解析 HTML

如果商品描述是通过HTML页面获取的，可以使用BeautifulSoup库来解析HTML内容。

示例代码：

import requests
from bs4 import BeautifulSoup

def get_product_description(url, timeout=10):
    """Download a product page and return the text of its description block.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The whitespace-stripped text of the first
        ``<div class="product-description">`` element, or ``None`` when the
        request fails, the status code is not 200, or the element is absent.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: print a message and signal failure with None.
        print(f"请求异常：{exc}")
        return None

    if response.status_code != 200:
        print(f"请求失败，状态码：{response.status_code}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    # The selector is a placeholder: it assumes the description lives in a
    # <div class="product-description"> element; adapt it to the target site.
    node = soup.find('div', class_='product-description')
    if node is None:
        # find() yields None when nothing matches; dereferencing .text on it
        # would raise AttributeError.
        print("未找到商品描述元素")
        return None
    return node.text.strip()

# Example usage: fetch the description from a placeholder product URL.
url = "https://example.com/product"
description = get_product_description(url)
# Print only when a non-empty description came back (None and "" are skipped).
if description:
    print("商品描述:", description)

2. 使用 PyQuery 解析 HTML

PyQuery是一个类似于jQuery的Python库，可以更方便地解析HTML。

示例代码：

from pyquery import PyQuery as pq

def get_product_description(url, timeout=10):
    """Download a product page and extract its description with PyQuery.

    Args:
        url: Address of the product page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The stripped text of the elements matching ``div.product-description``,
        or ``None`` when the request fails or the status code is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: print a message and signal failure with None.
        print(f"请求异常：{exc}")
        return None

    if response.status_code != 200:
        print(f"请求失败，状态码：{response.status_code}")
        return None

    doc = pq(response.text)
    # The selector is a placeholder: it assumes the description lives in a
    # <div class="product-description"> element; adapt it to the target site.
    return doc('div.product-description').text().strip()

# Example usage: fetch the description from a placeholder product URL.
url = "https://example.com/product"
description = get_product_description(url)
# Print only when a non-empty description came back (None and "" are skipped).
if description:
    print("商品描述:", description)

3. 解析 JSON 数据

如果商品描述是通过API接口以JSON格式返回的，可以直接使用requests提供的response.json()方法将响应体解析为Python对象（其内部基于json模块）。

示例代码：

import json

def get_product_description(api_url, timeout=10):
    """Call a product API and return the description field from its JSON body.

    Args:
        api_url: Address of the JSON endpoint describing the product.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled host.

    Returns:
        The value stored under ``product.description``; an empty string when
        that field (or the ``product`` object) is missing; ``None`` when the
        request fails, the status code is not 200, or the body is not JSON.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(api_url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported the same way as a bad
        # status code: print a message and signal failure with None.
        print(f"请求异常：{exc}")
        return None

    if response.status_code != 200:
        print(f"请求失败，状态码：{response.status_code}")
        return None

    try:
        data = response.json()
    except ValueError:
        # response.json() raises a ValueError subclass when the body is not
        # valid JSON (for example an HTML error page served with status 200).
        print("响应不是有效的JSON")
        return None

    # The field layout is a placeholder: it assumes {"product": {"description": ...}}.
    # Guard both levels so a list payload or "product": null cannot raise
    # AttributeError on .get().
    product = data.get('product') if isinstance(data, dict) else None
    if not isinstance(product, dict):
        return ''
    return product.get('description', '')

# Example usage: query a placeholder product API endpoint.
api_url = "https://api.example.com/product/12345"
description = get_product_description(api_url)
# Print only when a non-empty description came back (None and "" are skipped).
if description:
    print("商品描述:", description)

Java 爬虫中的解析方法

1. 使用 Jsoup 解析 HTML

Jsoup是一个强大的Java库，用于解析HTML文档。

示例代码：

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;

/**
 * Example crawler that reads a product description out of an HTML page
 * using Jsoup.
 */
public class ProductDescriptionCrawler {
    /** Browser-like User-Agent sent with every page request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3";

    /**
     * Fetches the description of a placeholder product page and prints it
     * when one was found.
     *
     * @param args unused command-line arguments
     */
    public static void main(String[] args) {
        String description = getProductDescription("https://example.com/product");
        if (description == null) {
            return;
        }
        System.out.println("商品描述: " + description);
    }

    /**
     * Downloads the page at {@code url} and extracts the combined text of all
     * elements matching {@code div.product-description}.
     *
     * @param url address of the product page
     * @return the description text, or {@code null} when the page could not be
     *         fetched or contains no matching element
     */
    public static String getProductDescription(String url) {
        Document page;
        try {
            page = Jsoup.connect(url).userAgent(USER_AGENT).get();
        } catch (IOException fetchError) {
            // Fetch failures are logged and reported to the caller as null.
            fetchError.printStackTrace();
            return null;
        }

        Elements matches = page.select("div.product-description");
        return matches.isEmpty() ? null : matches.text();
    }
}

2. 解析 JSON 数据

如果商品描述是通过API接口以JSON格式返回的，可以使用Apache HttpClient发送请求，再使用org.json库解析返回的数据。

示例代码：

import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.json.JSONException;
import org.json.JSONObject;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

/**
 * Example crawler that reads a product description from a JSON API using
 * Apache HttpClient for transport and org.json for parsing.
 */
public class ProductDescriptionCrawler {
    /**
     * Queries a placeholder product endpoint and prints the description when
     * one was returned.
     *
     * @param args unused command-line arguments
     */
    public static void main(String[] args) {
        String apiUrl = "https://api.example.com/product/12345";
        String description = getProductDescription(apiUrl);
        if (description != null) {
            System.out.println("商品描述: " + description);
        }
    }

    /**
     * Requests {@code apiUrl} and returns the string stored under
     * {@code product.description} in the JSON response.
     *
     * @param apiUrl address of the JSON endpoint describing the product
     * @return the description, or {@code null} when the request fails, the
     *         status code is not 200, or the body is not JSON of the expected
     *         shape
     */
    public static String getProductDescription(String apiUrl) {
        try (CloseableHttpClient client = HttpClients.createDefault()) {
            HttpGet httpGet = new HttpGet(apiUrl);
            httpGet.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3");
            try (CloseableHttpResponse response = client.execute(httpGet)) {
                int status = response.getStatusLine().getStatusCode();
                if (status != 200) {
                    // Report the failure instead of returning null silently.
                    System.err.println("请求失败，状态码：" + status);
                    return null;
                }
                // Pass UTF-8 as the fallback charset: without it EntityUtils
                // decodes as ISO-8859-1 whenever the server omits a charset,
                // which garbles non-ASCII descriptions.
                String jsonResponse = EntityUtils.toString(response.getEntity(), StandardCharsets.UTF_8);
                JSONObject jsonObject = new JSONObject(jsonResponse);
                return jsonObject.getJSONObject("product").getString("description");
            }
        } catch (IOException | JSONException e) {
            // JSONException is unchecked; catching it keeps malformed JSON or
            // a missing field from crashing the caller, matching the
            // null-on-failure contract used for I/O errors.
            e.printStackTrace();
        }
        return null;
    }
}