当前位置: 首页 > article >正文

使用python自动提取文本关键词

使用殷本纪文字内容:汤出,见野张网四面,祝曰:“自天下四方皆入吾网。”汤曰:“嘻,尽之矣!”乃去其三面,祝曰:“欲左,左。欲右,右。不用命,乃入吾网。”诸侯闻之,曰:“汤德至矣,及禽兽。”........

import jieba
import jieba.posseg as pseg
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse
from collections import defaultdict
import math
import os
from pyvis.network import Network
from networkx.algorithms import community


class PathVisualizer:
    def __init__(self, stopwords_path='stopwords.txt', window_size=5, damping=0.85):
        """初始化文本路径可视化器

        Args:
            stopwords_path: 停用词文件路径
            window_size: 共现窗口大小
            damping: 随机游走阻尼系数
        """
        self.window_size = window_size
        self.damping = damping
        self.stopwords = self._load_stopwords(stopwords_path)
        jieba.initialize()
        self.pos_filter = {'n', 'v', 'a', 'x'}  # 包含专业术语的词性过滤

    def _load_stopwords(self, path):
        """加载停用词表"""
        default_stopwords = {'的', '了', '在', '是', '和', '就', '不'}
        try:
            with open(path, 'r', encoding='utf-8') as f:
                return set([line.strip() for line in f]) | default_stopwords
        except FileNotFoundError:
            print(f"警告:未找到停用词文件 {path},使用默认停用词")
            return default_stopwords

    def _text_segment(self, text):
        """专业文本预处理"""
        words = []
        for word, pos in pseg.cut(text):
            if pos[0].lower() in self.pos_filter or 'x' in pos.lower():
                if word not in self.stopwords and len(word) > 1:
                    words.append(word)
        return words

    def _build_cooccurrence_graph(self, words):
        """构建多目标共现网络"""
        graph = nx.DiGraph()
        node_weights = defaultdict(int)

        for i in range(len(words)):
            window_start = max(0, i - self.window_size)
            current_word = words[i]
            node_weights[current_word] += 1 / (i + 1)  # 位置加权

            # 创建共现关系
            for j in range(window_start, i):
                prev_word = words[j]
                weight = 1 / (i - j)  # 距离衰减因子
                if graph.has_edge(prev_word, current_word):
                    graph[prev_word][current_word]['weight'] += weight
                else:
                    graph.add_edge(prev_word, current_word, weight=weight)

        # 添加节点属性
        nx.set_node_attributes(graph, dict(node_weights), 'weight')
        return graph

    def _classify_regions(self, graph):
        """使用Louvain算法进行社区划分"""
        undi_graph = graph.to_undirected()
        communities = community.louvain_communities(undi_graph, resolution=0.8)
        regions = {}
        for i, comm in enumerate(communities):
            for node in comm:
                regions[node] = chr(65 + i)  # 按社区编号转为字母
        return regions

    def visualize_network(self, graph, output_path, regions):
        """静态可视化方法"""
        plt.figure(figsize=(24, 14))

        # 优化布局算法
        pos = nx.kamada_kawai_layout(graph, scale=2)

        # 动态调整可视化参数
        node_sizes = [800 * math.log(graph.nodes[node]['weight'] + 1) for node in graph.nodes]
        edge_widths = [0.3 + 0.5 * math.log(graph[u][v]['weight'] + 1) for u, v in graph.edges]

        # 绘制节点(使用修正后的颜色格式)
        nx.draw_networkx_nodes(
            graph, pos,
            node_size=node_sizes,
            node_color=[self._region_color(regions[node]) for node in graph.nodes],
            edgecolors=(1, 1, 1, 0.8),  # 白色带透明度
            alpha=0.7
        )

        # 绘制边(使用RGBA元组格式)
        nx.draw_networkx_edges(
            graph, pos,
            width=edge_widths,
            edge_color=(0.5, 0.5, 0.5, 0.4),  # RGB(128,128,128)转换为归一化值
            arrowsize=15,
            alpha=0.4
        )

        # 添加智能标签
        self._draw_smart_labels(graph, pos)

        # 添加图例和注释
        self._add_legends()
        self._add_community_annotations(pos, regions)

        plt.axis('off')
        plt.savefig(output_path, dpi=300, bbox_inches='tight')
        plt.close()

    def _region_color(self, region):
        """区域颜色映射(使用Matplotlib兼容格式)"""
        colors = {
            'A': (0.18, 0.31, 0.31, 0.8),  # #2F4F4F -> RGBA
            'B': (0.47, 0.53, 0.60, 0.7),  # #778899
            'C': (0.66, 0.66, 0.66, 0.6),  # #A9A9A9
            'D': (0.83, 0.83, 0.83, 0.5),  # #D3D3D3
            'E': (0.94, 0.97, 1.00, 0.4)  # #F0F8FF
        }
        return colors.get(region, (1, 1, 1, 1))

    def _draw_smart_labels(self, graph, pos):
        """智能标签显示策略"""
        weights = np.array(list(nx.get_node_attributes(graph, 'weight').values()))
        threshold = np.percentile(weights, 75)

        # 仅标注重要节点
        labels = {node: node for node in graph.nodes if graph.nodes[node]['weight'] > threshold}
        nx.draw_networkx_labels(
            graph, pos, labels,
            font_size=10,
            font_family='SimHei',
            alpha=0.9,
            bbox=dict(facecolor=(1, 1, 1, 0.7), edgecolor='none')  # 半透明白色背景
        )

    def _add_legends(self):
        """添加自定义图例"""
        legend_elements = [
            plt.Line2D([0], [0], marker='o', color='w', label='核心节点',
                       markerfacecolor=(0.18, 0.31, 0.31, 0.8), markersize=15),
            plt.Line2D([0], [0], marker='o', color='w', label='次要节点',
                       markerfacecolor=(0.47, 0.53, 0.60, 0.7), markersize=12),
            plt.Line2D([0], [0], color=(0.5, 0.5, 0.5, 0.4), lw=3, label='关联强度')
        ]
        plt.legend(handles=legend_elements, loc='upper left', fontsize=12, framealpha=0.9)

    def _add_community_annotations(self, pos, regions):
        """添加社区标注(使用正确颜色格式)"""
        region_centers = defaultdict(list)
        for node, reg in regions.items():
            region_centers[reg].append(pos[node])

        for reg, coords in region_centers.items():
            x, y = zip(*coords)
            center = (np.mean(x), np.mean(y))
            plt.annotate(
                f'社区 {reg}',
                xy=center,
                xytext=(center[0], center[1] + 0.15),
                ha='center',
                fontsize=14,
                weight='bold',
                color=(0.80, 0.40, 0.40, 0.9),  # #FF6B6B -> RGBA
                arrowprops=dict(arrowstyle='->', color=(0.80, 0.40, 0.40, 0.7))
            )

    def generate_interactive_network(self, graph, regions, output_html):
        """生成交互式可视化(保持原始颜色格式)"""
        net = Network(height='800px', width='100%', notebook=False, bgcolor='#ffffff')

        # 颜色转换函数
        def to_hex(rgba):
            return "#{:02x}{:02x}{:02x}".format(
                int(rgba[0] * 255), int(rgba[1] * 255), int(rgba[2] * 255)
            )

        # 添加节点
        for node in graph.nodes:
            net.add_node(
                node,
                label=node,
                color=to_hex(self._region_color(regions[node])),
                size=15 * math.log(graph.nodes[node]['weight'] + 1),
                title=f"关键词: {node}<br>权重: {graph.nodes[node]['weight']:.2f}"
            )

        # 添加边
        for u, v in graph.edges:
            net.add_edge(
                u, v,
                width=0.3 * math.log(graph[u][v]['weight'] + 1),
                color='rgba(128,128,128,0.3)',  # PyVis支持CSS颜色格式
                title=f"关联强度: {graph[u][v]['weight']:.2f}"
            )

        # 物理引擎配置
        net.repulsion(
            node_distance=200,
            central_gravity=0.3,
            spring_length=150,
            damping=0.9
        )

        # 添加控制面板
        net.show_buttons(filter_=['physics', 'nodes', 'edges'])
        net.save_graph(output_html)

    def process_text(self, file_path):
        """完整处理流程"""
        if not os.path.exists(file_path):
            raise FileNotFoundError(f"文件路径不存在:{file_path}")

        with open(file_path, 'r', encoding='utf-8') as f:
            text = f.read()

        # 文本处理
        words = self._text_segment(text)
        if len(words) < 10:
            raise ValueError("文本内容过短,需至少包含10个有效词汇")

        # 构建网络
        graph = self._build_cooccurrence_graph(words)
        regions = self._classify_regions(graph)

        return graph, regions


# 使用示例
if __name__ == "__main__":
    # 输入输出配置
    input_file = r"D:\daku\1980.txt"
    static_output = r"D:\daku\static_vis.png"
    interactive_output = r"D:\daku\interactive_vis.html"

    # 初始化处理器
    processor = PathVisualizer()

    try:
        # 处理文本
        graph, regions = processor.process_text(input_file)

        # 生成两种可视化
        processor.visualize_network(graph, static_output, regions)
        processor.generate_interactive_network(graph, regions, interactive_output)

        print(f"静态可视化已保存至:{static_output}")
        print(f"交互式可视化已保存至:{interactive_output}")

    except Exception as e:
        print(f"处理过程中发生错误:{str(e)}")


http://www.kler.cn/a/579349.html

相关文章:

  • 【文献阅读】On-Device Language Models: A Comprehensive Review
  • LVGL开发说明
  • YOLOv10改进之MHAF(多分支辅助特征金字塔)
  • 【爬虫】开篇词
  • SSH/HTTP/HTTPS
  • Manus:革新未来的智能助手
  • 用R语言的XML库写一个采集图片的爬虫程序
  • 基于编译器特性浅析C++程序性能优化
  • 没有最好的,只有最合适的:重新认识测试工具的价值
  • 第十五届蓝桥杯----B组cpp----真题解析(小白版本)
  • PostgreSQL、SQL Server和MySQL数据库性能调优与故障排除技术
  • 用向量数据库建立本地知识库
  • Unity DOTS从入门到精通之 C# Job System
  • 【Recon】CTF Web类题目主要类型
  • 全栈网络安全|渗透测试-1
  • style-your-video风格化你的视频
  • 第七课:Python反爬攻防战:Headers/IP代理与验证码
  • Vue 3 的面试题
  • 【音视频】ffmpeg音视频处理基本流程
  • numpy常用函数详解