当前位置：首页 > article >正文

Mac使用pycharm+基于Kaggle的社交媒体情绪分析数据集，用python做词云的可视化

article 2025/3/19 17:05:57

1. PyCharm版本
一开始用的专业版，但是太久没有写代码就账户过期了，找半天Activation Code也没有找到，重新下载一个社区版，我点进去是社区版的页面，但是下载结果是专业版，后面仔细看，mad社区版在下面，得看清楚才下载。
mad下载完Mac打不开，原因是芯片不匹配，我自己Apple XX，得下载Apple silicon的，而不是英特尔，下载成功打开就可以用。
2. 下载所要的包

直接在终端输入 pip install xx 就可以。装完后用 pip list 看一眼是否安装成功。
在这里插入图片描述
3. 收集数据
https://www.kaggle.com/datasets/kashishparmar02/social-media-sentiments-analysis-dataset

4. 用元宝帮我生成代码

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import ssl

# Work around SSL certificate errors raised while NLTK fetches its data.
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate verification is switched off for default
# HTTPS contexts created afterwards in this process. Tolerable for a one-off
# local script; prefer installing proper CA certificates instead.
ssl._create_default_https_context = ssl._create_unverified_context

# Download the NLTK resources the script relies on.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases (3.9 and later); 'punkt' is kept for older releases.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)


# -------------------
# 数据预处理函数：（典型操作：小写转换、去除标点、分词（中文需用jieba）、去除停用词）
# -------------------
def clean_text(text):
    """Normalise one piece of raw text into a space-separated token string.

    Steps: lower-case, strip ASCII punctuation, tokenize with NLTK, then drop
    stop words and tokens of two characters or fewer.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # A missing cell must not become the literal word "nan"/"none" in the
    # word cloud, which is what str() would produce. `text != text` is true
    # only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    lowered = str(text).lower()
    without_punct = lowered.translate(str.maketrans('', '', string.punctuation))
    stop_words = _get_stop_words()
    kept = [
        token
        for token in word_tokenize(without_punct)
        if token not in stop_words and len(token) > 2
    ]
    return ' '.join(kept)


# Tokens that are noise in social-media text (handles, URL fragments, "rt").
_CUSTOM_STOPWORDS = frozenset({'user', 'http', 'https', 'www', 'com', 'rt', 'amp'})
_stop_words_cache = None


def _get_stop_words():
    """Return the English + custom stop-word set, building it on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        # Built once: clean_text runs per row, and rebuilding the set for
        # every call is loop-invariant work.
        _stop_words_cache = frozenset(stopwords.words('english')) | _CUSTOM_STOPWORDS
    return _stop_words_cache


# -------------------
# 主程序
# -------------------
if __name__ == "__main__":
    # Script entry point: load the CSV, clean the text, then render one
    # overall word cloud and a grid of per-sentiment word clouds.

    # Columns the rest of the script depends on.
    required_columns = ['Text', 'Sentiment']

    # Load the data. Only the read itself is inside the try so that a later
    # problem is never misreported as a loading failure. Errors exit with a
    # non-zero status (the bare exit() helper returned 0 and is only
    # available when the `site` module is loaded).
    try:
        df = pd.read_csv('social_media_sentiment.csv')
    except FileNotFoundError:
        print("错误：文件未找到，请确认文件路径和名称是否正确")
        raise SystemExit(1)
    except Exception as e:
        print(f"加载数据时发生意外错误：{str(e)}")
        raise SystemExit(1)

    # Validate the schema before any column is accessed.
    missing_cols = [col for col in required_columns if col not in df.columns]
    if missing_cols:
        print(f"错误：数据集中缺少必要列：{missing_cols}")
        raise SystemExit(1)

    print("数据加载成功！前5行样例：")
    print(df[required_columns].head())

    # Rows without text or label cannot contribute to any cloud. Labels are
    # stripped because values differing only by surrounding whitespace would
    # otherwise be counted as separate sentiments, miss the colour lookup
    # below and end up in the output file names.
    df = df.dropna(subset=required_columns).copy()
    df['Sentiment'] = df['Sentiment'].astype(str).str.strip()
    df['cleaned_text'] = df['Text'].apply(clean_text)

    # Overall word cloud. WordCloud.generate raises on input without words,
    # so stop with a clear message instead.
    all_text = ' '.join(df['cleaned_text'])
    if not all_text.strip():
        print("错误：清洗后没有可用于生成词云的文本")
        raise SystemExit(1)

    wordcloud = WordCloud(
        width=1600,
        height=800,
        background_color='white',
        max_words=200,
        colormap='viridis',
        collocations=False,
    ).generate(all_text)

    plt.figure(figsize=(20, 10))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Overall Sentiment Word Cloud', fontsize=24, pad=20)
    plt.tight_layout()
    plt.savefig('overall_wordcloud.png', dpi=300)
    plt.show()

    # Matplotlib colormap per sentiment; anything else falls back to viridis.
    sentiment_colors = {
        'Positive': 'Greens',
        'Negative': 'Reds',
        'Neutral': 'Blues',
        'Anger': 'Oranges',
        'Fear': 'Purples',
        'Happiness': 'YlOrBr',
        'Joy': 'YlGn',
    }

    # The ten most frequent sentiment labels.
    top_sentiments = df['Sentiment'].value_counts().nlargest(10).index.tolist()

    # Grid of sub-plots, three per row. squeeze=False keeps `axes` a 2-D
    # array for every row count, so flatten() is always valid.
    cols = 3
    rows = (len(top_sentiments) + cols - 1) // cols
    _, axes = plt.subplots(rows, cols, figsize=(24, rows * 8), squeeze=False)
    axes = axes.flatten()

    # The grid always has at least as many cells as sentiments.
    for ax, sentiment in zip(axes, top_sentiments):
        text = ' '.join(df.loc[df['Sentiment'] == sentiment, 'cleaned_text'])
        if not text.strip():
            # Nothing left after cleaning: leave the cell blank.
            ax.axis('off')
            continue

        wc = WordCloud(
            width=800,
            height=600,
            background_color='white',
            max_words=100,
            colormap=sentiment_colors.get(sentiment, 'viridis'),
            collocations=False,
        ).generate(text)

        ax.imshow(wc, interpolation='bilinear')
        ax.set_title(f'{sentiment} Sentiment', fontsize=18, pad=10)
        ax.axis('off')
        wc.to_file(f"{sentiment.lower()}_wordcloud.png")

    # Hide the unused cells of the last row.
    for ax in axes[len(top_sentiments):]:
        ax.axis('off')

    plt.tight_layout()
    plt.savefig('sentiment_wordclouds.png', dpi=300)
    plt.show()

    print("词云已生成并保存为PNG文件！")

代码详解

‌NLTK（Natural Language Toolkit）是一个开源的Python工具包，专门用于自然语言处理（NLP）的研究和开发‌。它由Steven Bird和Edward Loper在宾夕法尼亚大学计算机和信息科学系开发，提供了丰富的模块、数据集和算法，适用于NLP入门学习和实验性项目开发‌。
‌丰富的内置功能‌：NLTK支持多种NLP任务，如文本预处理（分词、去停用词、词性标注等）、语言模型和统计计算、命名实体识别、语法分析与句法树构建等，无需额外安装复杂组件‌。
（NLTK处理文本，以方便将文本像数字数据一样进行分析）

停用词是指在信息检索中,为节省存储空间和提高搜索效率,在处理自然语言数据(或文本)之前或之后会自动过滤掉某些字或词,这些字或词即被称为Stop Words(停用词)。
（过滤不必要的分析词，提高效率）

‌SSL（Secure Sockets Layer）‌是一种安全协议，用于在互联网通信中确保数据传输的安全，通过在客户端和服务器之间建立一个加密通道，保护传输数据免受窃听和篡改（对传送的数据进行加密和隐藏；确保数据在传送中不被改变，即数据的完整性）。
SSL的最后一个版本是SSL 3.0，之后由TLS（Transport Layer Security）取代；TLS是SSL的后续发展和替代品，现在通常所说的“SSL”实际上指的大多是TLS。
SSL证书错误是指在SSL/TLS握手过程中，客户端（浏览器，或者这里发起下载的Python程序）无法验证服务器的SSL证书，从而报错。
（需要从互联网下载资源（如预训练模型、数据集或依赖库）时会用到）

‌YlOrBr‌：‘YlOrBr’（Yellow-Orange-Brown）是一种颜色映射方案，从黄色到橙色再到棕色。这种颜色方案常用于表示数据的变化范围，黄色通常代表较低的值，橙色和棕色则代表较高的值。在Matplotlib库中，可以通过plt.cm.YlOrBr来获取这种颜色映射对象‌

YlGn：'YlGn'（Yellow-Green）是一种从黄色到绿色的颜色映射方案。这种颜色方案通常用于表示从低到高的数据变化，黄色代表较低的值，绿色代表较高的值。在seaborn库中，可以通过 sns.color_palette('YlGn') 来获取这种颜色方案。

出现问题，没有Negative的词云，修改代码

# -*- coding: utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import ssl

# Work around SSL certificate errors raised while NLTK fetches its data.
# NOTE(review): this replaces the default HTTPS context factory with the
# unverified one, i.e. certificate verification is switched off for default
# HTTPS contexts created afterwards in this process. Tolerable for a one-off
# local script; prefer installing proper CA certificates instead.
ssl._create_default_https_context = ssl._create_unverified_context
# 'punkt_tab' is the tokenizer data that word_tokenize loads on newer NLTK
# releases (3.9 and later); 'punkt' is kept for older releases.
for _resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_resource)


def clean_text(text):
    """Normalise one piece of raw text while keeping negation words.

    Steps: remove emoticons such as ``:)``, ``;-]`` or ``(:``, replace every
    character that is not a word character, whitespace or an apostrophe with
    a space, lower-case, tokenize with NLTK, then drop stop words. The
    negations "not", "no", "never", "nor" and "none" are deliberately NOT
    treated as stop words.

    Args:
        text: The raw text. Non-string values are converted with ``str()``;
            missing values (``None`` or a float NaN) are treated as empty.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when the input
        is missing or nothing survives the filtering.
    """
    # A missing cell must not become the literal word "nan"/"none" in the
    # word cloud, which is what str() would produce. `text != text` is true
    # only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    raw = str(text)
    no_emoticons = re.sub(
        r"(?:[:=8;][\-o\*']?[)>\]]|[(\[<][\-o\*']?[:=8;])", "", raw
    )
    no_symbols = re.sub(r'[^\w\s\'’]', ' ', no_emoticons)
    stop_words = _sentiment_stop_words()
    return ' '.join(
        token
        for token in word_tokenize(no_symbols.lower())
        if token and token not in stop_words
    )


# URL fragments and retweet markers that carry no sentiment.
_SENTIMENT_CUSTOM_STOPWORDS = frozenset({'http', 'https', 'www', 'com', 'rt', 'amp'})
# Negations removed from the stop-word list so they stay in the output.
_KEPT_NEGATIONS = frozenset({'not', 'no', 'never', 'nor', 'none'})
_sentiment_stop_words_cache = None


def _sentiment_stop_words():
    """Return the stop-word set (negations excluded), built on first use."""
    global _sentiment_stop_words_cache
    if _sentiment_stop_words_cache is None:
        # Built once: clean_text runs per row, and rebuilding the set for
        # every call is loop-invariant work.
        base = frozenset(stopwords.words('english')) - _KEPT_NEGATIONS
        _sentiment_stop_words_cache = base | _SENTIMENT_CUSTOM_STOPWORDS
    return _sentiment_stop_words_cache


if __name__ == "__main__":
    # Script entry point: load the CSV, keep only the three target
    # sentiments, and draw their word clouds side by side.
    target_sentiments = ['Positive', 'Negative', 'Neutral']

    # Load and normalise the data: labels are stripped, capitalised and the
    # short forms Pos/Neg/Neu expanded before filtering. Any failure here
    # (missing file, missing column, ...) ends the script with a non-zero
    # status; the bare exit() helper returned 0 and is only available when
    # the `site` module is loaded.
    try:
        df = pd.read_csv('social_media_sentiment.csv')
        df['Sentiment'] = df['Sentiment'].str.strip().str.capitalize().replace({
            'Pos': 'Positive', 'Neg': 'Negative', 'Neu': 'Neutral'
        })
        df = df[df['Sentiment'].isin(target_sentiments)].copy()
    except Exception as e:
        print(f"数据加载失败：{str(e)}")
        raise SystemExit(1)

    # Per sentiment: the matplotlib colormap for the cloud and the hex colour
    # used for the panel title.
    sentiment_colors = {
        'Positive': {'colormap': 'Greens', 'color': '#4CAF50'},  # green family
        'Negative': {'colormap': 'Reds', 'color': '#F44336'},  # red family
        'Neutral': {'colormap': 'Blues', 'color': '#2196F3'},  # blue family
    }

    fig, axes = plt.subplots(1, 3, figsize=(30, 12), facecolor='white')

    for ax, sentiment in zip(axes, target_sentiments):
        palette = sentiment_colors[sentiment]
        sentiment_df = df[df['Sentiment'] == sentiment]

        # Missing texts are dropped rather than stringified, so the literal
        # word "nan" never reaches the cloud.
        text = ' '.join(
            sentiment_df['Text'].dropna().astype(str).apply(clean_text)
        )

        # The Negative panel uses a smaller canvas, fewer words and smaller
        # fonts than the other two.
        is_negative = sentiment == 'Negative'
        wc_config = {
            'width': 1000 if is_negative else 1200,
            'height': 800,
            'background_color': 'white',
            'max_words': 50 if is_negative else 150,
            'colormap': palette['colormap'],
            'collocations': False,
            'min_font_size': 12 if is_negative else 18,
            'max_font_size': 80 if is_negative else 120,
        }

        # Best effort per panel: a failure (e.g. no words left for this
        # sentiment) is reported and marked in the figure, and the remaining
        # panels are still drawn.
        try:
            wc = WordCloud(**wc_config).generate(text)
            ax.imshow(wc, interpolation='bilinear')
            ax.set_title(f'{sentiment} (n={len(sentiment_df)})',
                         fontsize=24, pad=20,
                         color=palette['color'])
        except Exception as e:
            print(f"生成{sentiment}词云失败：{str(e)}")
            ax.text(0.5, 0.5, '生成失败',
                    ha='center', va='center',
                    fontsize=20,
                    color=palette['color'])
            ax.axis('off')
        else:
            ax.axis('off')
            wc.to_file(f"{sentiment.lower()}_wordcloud.png")

    plt.tight_layout()
    plt.savefig('sentiment_comparison.png', dpi=300, bbox_inches='tight')
    plt.show()