当前位置：首页 > article >正文

使用python自动提取文本关键词

article 2025/3/11 2:04:04

使用殷本纪文字内容：汤出，见野张网四面，祝曰：“自天下四方皆入吾网。”汤曰：“嘻，尽之矣！”乃去其三面，祝曰：“欲左，左。欲右，右。不用命，乃入吾网。”诸侯闻之，曰：“汤德至矣，及禽兽。”........

import jieba
import jieba.posseg as pseg
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from collections import defaultdict
import math
import os
from pyvis.network import Network
from networkx.algorithms import community


class PathVisualizer:
    def __init__(self, stopwords_path='stopwords.txt', window_size=5, damping=0.85):
        """初始化文本路径可视化器

        Args:
            stopwords_path: 停用词文件路径
            window_size: 共现窗口大小
            damping: 随机游走阻尼系数
        """
        self.window_size = window_size
        self.damping = damping
        self.stopwords = self._load_stopwords(stopwords_path)
        jieba.initialize()
        self.pos_filter = {'n', 'v', 'a', 'x'}  # 包含专业术语的词性过滤

    def _load_stopwords(self, path):
        """加载停用词表"""
        default_stopwords = {'的', '了', '在', '是', '和', '就', '不'}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return set([line.strip() for line in f]) | default_stopwords
        except FileNotFoundError:
            print(f"警告：未找到停用词文件 {path}，使用默认停用词")
            return default_stopwords

    def _text_segment(self, text):
        """专业文本预处理"""
        words = []
        for word, pos in pseg.cut(text):
            if pos[0].lower() in self.pos_filter or 'x' in pos.lower():
                if word not in self.stopwords and len(word) > 1:
                    words.append(word)
        return words

    def _build_cooccurrence_graph(self, words):
        """构建多目标共现网络"""
        graph = nx.DiGraph()
        node_weights = defaultdict(int)

        for i in range(len(words)):
            window_start = max(0, i - self.window_size)
            current_word = words[i]
            node_weights[current_word] += 1 / (i + 1)  # 位置加权

            # 创建共现关系
            for j in range(window_start, i):
                prev_word = words[j]
                weight = 1 / (i - j)  # 距离衰减因子
                if graph.has_edge(prev_word, current_word):
                    graph[prev_word][current_word]['weight'] += weight
                else:
                    graph.add_edge(prev_word, current_word, weight=weight)

        # 添加节点属性
        nx.set_node_attributes(graph, dict(node_weights), 'weight')
        return graph

    def _classify_regions(self, graph):
        """使用Louvain算法进行社区划分"""
        undi_graph = graph.to_undirected()
        communities = community.louvain_communities(undi_graph, resolution=0.8)
        regions = {}
        for i, comm in enumerate(communities):
            for node in comm:
                regions[node] = chr(65 + i)  # 按社区编号转为字母
        return regions

    def visualize_network(self, graph, output_path, regions):
        """静态可视化方法"""
        plt.figure(figsize=(24, 14))

        # 优化布局算法
        pos = nx.kamada_kawai_layout(graph, scale=2)

        # 动态调整可视化参数
        node_sizes = [800 * math.log(graph.nodes[node]['weight'] + 1) for node in graph.nodes]
        edge_widths = [0.3 + 0.5 * math.log(graph[u][v]['weight'] + 1) for u, v in graph.edges]

        # 绘制节点（使用修正后的颜色格式）
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=[self._region_color(regions[node]) for node in graph.nodes],
            edgecolors=(1, 1, 1, 0.8),  # 白色带透明度
            alpha=0.7
        )

        # 绘制边（使用RGBA元组格式）
        nx.draw_networkx_edges(
            graph, pos,
            width=edge_widths,
            edge_color=(0.5, 0.5, 0.5, 0.4),  # RGB(128,128,128)转换为归一化值
            arrowsize=15,
            alpha=0.4
        )

        # 添加智能标签
        self._draw_smart_labels(graph, pos)

        # 添加图例和注释
        self._add_legends()
        self._add_community_annotations(pos, regions)

        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

    def _region_color(self, region):
        """区域颜色映射（使用Matplotlib兼容格式）"""
        colors = {
            'A': (0.18, 0.31, 0.31, 0.8),  # #2F4F4F -> RGBA
            'B': (0.47, 0.53, 0.60, 0.7),  # #778899
            'C': (0.66, 0.66, 0.66, 0.6),  # #A9A9A9
            'D': (0.83, 0.83, 0.83, 0.5),  # #D3D3D3
            'E': (0.94, 0.97, 1.00, 0.4)  # #F0F8FF
        }
        return colors.get(region, (1, 1, 1, 1))

    def _draw_smart_labels(self, graph, pos):
        """智能标签显示策略"""
        weights = np.array(list(nx.get_node_attributes(graph, 'weight').values()))
        threshold = np.percentile(weights, 75)

        # 仅标注重要节点
        labels = {node: node for node in graph.nodes if graph.nodes[node]['weight'] > threshold}
        nx.draw_networkx_labels(
            graph, pos, labels,
            font_size=10,
            font_family='SimHei',
            alpha=0.9,
            bbox=dict(facecolor=(1, 1, 1, 0.7), edgecolor='none')  # 半透明白色背景
        )

    def _add_legends(self):
        """添加自定义图例"""
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label='核心节点',
                       markerfacecolor=(0.18, 0.31, 0.31, 0.8), markersize=15),
            plt.Line2D([0], [0], marker='o', color='w', label='次要节点',
                       markerfacecolor=(0.47, 0.53, 0.60, 0.7), markersize=12),
            plt.Line2D([0], [0], color=(0.5, 0.5, 0.5, 0.4), lw=3, label='关联强度')
        ]
        plt.legend(handles=legend_elements, loc='upper left', fontsize=12, framealpha=0.9)

    def _add_community_annotations(self, pos, regions):
        """添加社区标注（使用正确颜色格式）"""
        region_centers = defaultdict(list)
        for node, reg in regions.items():
            region_centers[reg].append(pos[node])

        for reg, coords in region_centers.items():
            x, y = zip(*coords)
            center = (np.mean(x), np.mean(y))
            plt.annotate(
                f'社区 {reg}',
                xy=center,
                xytext=(center[0], center[1] + 0.15),
                ha='center',
                fontsize=14,
                weight='bold',
                color=(0.80, 0.40, 0.40, 0.9),  # #FF6B6B -> RGBA
                arrowprops=dict(arrowstyle='->', color=(0.80, 0.40, 0.40, 0.7))
            )

    def generate_interactive_network(self, graph, regions, output_html):
        """生成交互式可视化（保持原始颜色格式）"""
        net = Network(height='800px', width='100%', notebook=False, bgcolor='#ffffff')

        # 颜色转换函数
        def to_hex(rgba):
            return "#{:02x}{:02x}{:02x}".format(
                int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255)
            )

        # 添加节点
        for node in graph.nodes:
            net.add_node(
                node,
                label=node,
                color=to_hex(self._region_color(regions[node])),
                size=15 * math.log(graph.nodes[node]['weight'] + 1),
                title=f"关键词: {node}<br>权重: {graph.nodes[node]['weight']:.2f}"
            )

        # 添加边
        for u, v in graph.edges:
            net.add_edge(
                u, v,
                width=0.3 * math.log(graph[u][v]['weight'] + 1),
                color='rgba(128,128,128,0.3)',  # PyVis支持CSS颜色格式
                title=f"关联强度: {graph[u][v]['weight']:.2f}"
            )

        # 物理引擎配置
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=150,
            damping=0.9
        )

        # 添加控制面板
        net.show_buttons(filter_=['physics', 'nodes', 'edges'])
        net.save_graph(output_html)

    def process_text(self, file_path):
        """完整处理流程"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件路径不存在：{file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 文本处理
        words = self._text_segment(text)
        if len(words) < 10:
            raise ValueError("文本内容过短，需至少包含10个有效词汇")

        # 构建网络
        graph = self._build_cooccurrence_graph(words)
        regions = self._classify_regions(graph)

        return graph, regions


# 使用示例
if __name__ == "__main__":
    # 输入输出配置
    input_file = r"D:\daku\1980.txt"
    static_output = r"D:\daku\static_vis.png"
    interactive_output = r"D:\daku\interactive_vis.html"

    # 初始化处理器
    processor = PathVisualizer()

    try:
        # 处理文本
        graph, regions = processor.process_text(input_file)

        # 生成两种可视化
        processor.visualize_network(graph, static_output, regions)
        processor.generate_interactive_network(graph, regions, interactive_output)

        print(f"静态可视化已保存至：{static_output}")
        print(f"交互式可视化已保存至：{interactive_output}")

    except Exception as e:
        print(f"处理过程中发生错误：{str(e)}")

查看全文

http://www.kler.cn/a/579349.html