问卷调查系统Two-Step-Kmeans-前端后端搭建完成
主要功能
- 主页展示:通过 `home` 视图函数展示主页面,并列出所有已上传的CSV文件供用户选择。
- 文件上传:允许用户上传新的CSV文件到服务器。确保文件为CSV格式,并根据上传时间生成唯一的文件名保存。
- 数据预处理:对选定的CSV文件进行预处理,包括删除不需要的列、处理缺失值等,并将处理后的数据保存为新文件。
- 执行聚类分析:基于选定的特征对数据进行标准化、降维(PCA),然后进行两阶段KMeans聚类分析。计算并返回最终轮廓系数以评估聚类质量。
- 获取聚类中心:提供接口获取最后一次聚类分析的聚类中心信息。
实现步骤
- 前端页面 (`index.html`):
  - 提供表单让用户上传文件、选择文件进行预处理、运行数据分析和获取聚类中心。
  - 使用jQuery简化与后端API的交互,如文件上传、异步请求处理结果等。
- 后端逻辑 (Django视图函数):
  - `home`: 渲染主页,列出所有可用的CSV文件。
  - `upload_file`: 处理文件上传请求,验证文件类型并保存。
  - `preprocess_data`: 对指定的CSV文件进行数据清洗和预处理。
  - `perform_clustering`: 执行两阶段聚类分析,返回最终轮廓系数。
  - `get_cluster_centers`: 返回最后一次聚类分析的聚类中心信息。
- URL映射: 定义了各个视图函数对应的URL路径,使前端能够正确调用相应的后端服务。
关键技术点
- Django框架: 用于快速开发Web应用的基础架构。
- Pandas库: 数据处理的核心工具,支持高效的数据操作和分析。
- Scikit-learn库: 提供机器学习算法,如KMeans聚类、PCA降维等。
- jQuery: 简化AJAX请求,提升用户体验。
整体代码:
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>数据处理与分析</title>
    <script src="https://code.jquery.com/jquery-3.6.0.min.js"></script>
</head>
<body>
    <!-- Upload a new CSV file to the server. -->
    <h2>上传CSV文件</h2>
    <form id="upload-form" enctype="multipart/form-data">
        {% csrf_token %}
        <input type="file" name="file" accept=".csv" required>
        <button type="submit">上传</button>
    </form>
    <div id="upload-status"></div>

    <!-- Pick an uploaded file and run the cleaning step on it. -->
    <h2>选择文件进行预处理</h2>
    <form id="preprocess-form">
        {% csrf_token %}
        <label for="filename-preprocess">选择文件:</label>
        <select name="filename" id="filename-preprocess" required>
            <option value="">--请选择文件--</option>
            {% for file in files %}
            <option value="{{ file }}">{{ file }}</option>
            {% endfor %}
        </select>
        <button type="submit">处理数据</button>
    </form>
    <div id="preprocess-status"></div>

    <!-- Pick a file and run the clustering analysis on it. -->
    <h2>运行数据分析</h2>
    <form id="analysis-form">
        {% csrf_token %}
        <label for="filename-analysis">选择文件:</label>
        <select name="filename" id="filename-analysis" required>
            <option value="">--请选择文件--</option>
            {% for file in files %}
            <option value="{{ file }}">{{ file }}</option>
            {% endfor %}
        </select>
        <button type="submit">运行分析</button>
    </form>
    <div id="analysis-status"></div>

    <!-- Fetch the cluster centers from the server. -->
    <h2>获取聚类中心</h2>
    <form id="cluster-centers-form">
        {% csrf_token %}
        <label for="filename-cluster-centers">选择文件:</label>
        <select name="filename" id="filename-cluster-centers" required>
            <option value="">--请选择文件--</option>
            {% for file in files %}
            <option value="{{ file }}">{{ file }}</option>
            {% endfor %}
        </select>
        <button type="submit">获取聚类中心</button>
    </form>
    <div id="cluster-centers-status"></div>

    <script>
    $(document).ready(function(){
        // All three file selectors are kept in sync when a new file appears.
        var FILE_SELECTS = '#filename-preprocess, #filename-analysis, #filename-cluster-centers';

        // Append a file name as a new option to every file selector.
        function addFileOption(name){
            $(FILE_SELECTS).append($('<option>', {
                value: name,
                text: name
            }));
        }

        // Extract a readable reason from a failed request: prefer the JSON
        // "error" field, then the raw body, then the HTTP status text.
        function errorText(xhr){
            if (xhr.responseJSON && xhr.responseJSON.error) {
                return xhr.responseJSON.error;
            }
            return xhr.responseText || xhr.statusText;
        }

        // Upload a file (multipart, so jQuery must not serialize the body).
        $('#upload-form').on('submit', function(event){
            event.preventDefault();
            var formData = new FormData(this);
            $.ajax({
                url: '{% url "upload_file" %}',
                type: 'POST',
                data: formData,
                processData: false,
                contentType: false,
                success: function(response){
                    $('#upload-status').text(response.message);
                    // Make the new file selectable without reloading the page.
                    addFileOption(response.filename);
                },
                error: function(xhr){
                    $('#upload-status').text('上传失败: ' + errorText(xhr));
                }
            });
        });

        // Preprocess the selected file.
        $('#preprocess-form').on('submit', function(event){
            event.preventDefault();
            var formData = $(this).serialize();
            $.ajax({
                url: '{% url "preprocess_data" %}',
                type: 'POST',
                data: formData,
                success: function(response){
                    $('#preprocess-status').text(response.message);
                    // If the response names the processed file, offer it too.
                    if (response.filename) {
                        addFileOption(response.filename);
                    }
                },
                error: function(xhr){
                    $('#preprocess-status').text('处理失败: ' + errorText(xhr));
                }
            });
        });

        // Run the clustering analysis.
        $('#analysis-form').on('submit', function(event){
            event.preventDefault();
            var formData = $(this).serialize();
            $.ajax({
                url: '{% url "perform_clustering" %}',
                type: 'POST',
                data: formData,
                success: function(response){
                    // Build nodes with .text() so server strings are never
                    // interpreted as HTML.
                    var $status = $('#analysis-status').empty();
                    $('<p>').text(response.message).appendTo($status);
                    if (response.final_silhouette_score !== undefined) {
                        $('<p>')
                            .text('最终轮廓系数: ' + response.final_silhouette_score)
                            .appendTo($status);
                    }
                },
                error: function(xhr){
                    console.log('Error:', xhr);
                    $('#analysis-status').text('分析失败: ' + errorText(xhr));
                }
            });
        });

        // Fetch the cluster centers.
        $('#cluster-centers-form').on('submit', function(event){
            event.preventDefault();
            var formData = $(this).serialize();
            $.ajax({
                url: '{% url "get_cluster_centers" %}',
                type: 'POST',
                data: formData,
                success: function(response){
                    var lines = ['聚类中心:'];
                    (response.cluster_centers || []).forEach(function(center){
                        lines.push(JSON.stringify(center));
                    });
                    // A <pre> keeps one center per line; plain "\n" inside
                    // .html() would be collapsed by the browser.
                    $('#cluster-centers-status')
                        .empty()
                        .append($('<pre>').text(lines.join('\n')));
                },
                error: function(xhr){
                    console.log('Error:', xhr);
                    $('#cluster-centers-status').text('获取聚类中心失败: ' + errorText(xhr));
                }
            });
        });
    });
    </script>
</body>
</html>
# views.py
from django.http import JsonResponse
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from django.http import JsonResponse, HttpResponse
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from django.shortcuts import render
import os
from django.conf import settings
def home(request):
    """Render the main page without any template context.

    NOTE: a second ``home`` defined further down in this module rebinds the
    name, so that later definition is the one the module ends up exposing.
    """
    template_name = 'index.html'
    return render(request, template_name)
import os
from django.shortcuts import render
from django.http import JsonResponse, HttpResponse
from django.conf import settings
from datetime import datetime
import os
from django.shortcuts import render
from django.http import JsonResponse, HttpResponse
import pandas as pd
from django.conf import settings
from datetime import datetime
def home(request):
    """Render the main page with the CSV files available for selection.

    Every entry of ``settings.BASE_DIR`` whose name ends in ``.csv`` is passed
    to the ``index.html`` template under the ``files`` context key.
    """
    csv_names = []
    for entry in os.listdir(settings.BASE_DIR):
        if entry.endswith('.csv'):
            csv_names.append(entry)
    context = {'files': csv_names}
    return render(request, 'index.html', context)
def upload_file(request):
    """Store an uploaded CSV file in ``settings.BASE_DIR`` under a unique name.

    Expects a POST request with the file in the ``file`` field.

    Returns:
        JsonResponse with ``message`` and the stored ``filename`` on success;
        HTTP 400 when the request is not a POST with a file, or when the file
        name does not end in ``.csv``.
    """
    if request.method != 'POST' or not request.FILES.get('file'):
        return JsonResponse({"error": "无效请求"}, status=400)

    uploaded_file = request.FILES['file']
    # Keep only the final path component so the client-supplied name can never
    # steer the write outside BASE_DIR.
    original_name = os.path.basename(uploaded_file.name)

    # Only CSV files are accepted.
    if not original_name.endswith('.csv'):
        return HttpResponse("仅支持CSV文件", status=400)

    # Microsecond resolution: with whole seconds, two uploads of the same file
    # name within one second would map to the same path and overwrite each
    # other.
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    unique_filename = f"{timestamp}_{original_name}"
    save_path = os.path.join(settings.BASE_DIR, unique_filename)

    # Stream the upload to disk chunk by chunk instead of loading it whole.
    with open(save_path, 'wb') as destination:
        for chunk in uploaded_file.chunks():
            destination.write(chunk)

    return JsonResponse({"message": "文件上传成功", "filename": unique_filename})
from datetime import datetime
def preprocess_data(request):
    """Clean a previously uploaded CSV file and save the result as a new file.

    Expects a POST request whose ``filename`` field names a ``.csv`` file
    located directly in ``settings.BASE_DIR``. The cleaned data is written to
    ``processed_<timestamp>_<filename>`` in the same directory.

    Returns:
        JsonResponse with ``message`` and the processed ``filename`` on
        success; HTTP 400 for a non-POST request or a missing/invalid file
        name, 404 when the file does not exist, 500 for any other failure.
    """
    if request.method != 'POST':
        return JsonResponse({"error": "无效请求"}, status=400)

    filename = request.POST.get('filename')
    if not filename:
        return JsonResponse({"error": "请选择一个文件"}, status=400)

    # The name is client-controlled: accept only a bare ``*.csv`` file name so
    # that neither "../" segments nor an absolute path can escape BASE_DIR
    # (os.path.join discards BASE_DIR when given an absolute path).
    if os.path.basename(filename) != filename or not filename.endswith('.csv'):
        return JsonResponse({"error": "无效的文件名"}, status=400)

    file_path = os.path.join(settings.BASE_DIR, filename)
    try:
        df = pd.read_csv(file_path)

        # Drop survey metadata columns that are not needed; columns absent
        # from this file are ignored.
        columns_to_drop = ['答题序号', '开始时间', '提交时间', '答题时长',
                           'IP省份', 'IP城市', 'IP地址', '浏览器', '操作系统', 'Q5|open']
        df.drop(columns=columns_to_drop, inplace=True, errors='ignore')

        # Drop incomplete rows. This runs before 'Q1' and '来源' are removed,
        # so a missing value in either of those columns also discards the row.
        df.dropna(inplace=True)
        df.drop(columns=['Q1', '来源'], inplace=True, errors='ignore')

        # Write the result next to the source file under a timestamped name.
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        processed_filename = f"processed_{timestamp}_{filename}"
        processed_file_path = os.path.join(settings.BASE_DIR, processed_filename)
        df.to_csv(processed_file_path, index=False)

        return JsonResponse({
            "message": f"数据预处理完成并保存为 {processed_filename}",
            # Lets a client offer the new file without re-listing the directory.
            "filename": processed_filename,
        })
    except FileNotFoundError:
        return JsonResponse({"error": f"无法找到文件 {filename}"}, status=404)
    except Exception as e:
        # View boundary: report the failure to the client instead of crashing.
        return JsonResponse({"error": str(e)}, status=500)
from django.http import JsonResponse
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import os
from django.conf import settings
from django.http import JsonResponse
import os
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from django.conf import settings
def perform_clustering(request):
    """Run the two-stage KMeans analysis on a CSV file and report its quality.

    Expects a POST request whose ``filename`` field names a ``.csv`` file
    located directly in ``settings.BASE_DIR``.

    Pipeline: standardize the selected features, reduce them to two principal
    components, pick the first-stage cluster count by best silhouette score,
    append the first-stage labels as an extra feature, then run a final
    3-cluster KMeans on the re-standardized features.

    Side effects:
        Stores ``cluster_centers`` (final centers, in the standardized feature
        space) and ``clusters_final`` (final label per row) in
        ``request.session`` so that ``get_cluster_centers`` can return them.

    Returns:
        JsonResponse with ``message`` and ``final_silhouette_score`` on
        success; HTTP 400 for a non-POST request, a missing/invalid file name,
        missing feature columns or too few usable rows; 404 when the file does
        not exist; 500 for any other failure.
    """
    if request.method != 'POST':
        return JsonResponse({"error": "无效请求"}, status=400)

    filename = request.POST.get('filename')
    if not filename:
        return JsonResponse({"error": "请选择一个文件"}, status=400)

    # The name is client-controlled: accept only a bare ``*.csv`` file name so
    # that neither "../" segments nor an absolute path can escape BASE_DIR.
    if os.path.basename(filename) != filename or not filename.endswith('.csv'):
        return JsonResponse({"error": "无效的文件名"}, status=400)

    file_path = os.path.join(settings.BASE_DIR, filename)
    selected_features = ['Q10', 'Q12', 'Q13', 'Q14']
    final_k = 3  # the second stage always produces three clusters

    try:
        df = pd.read_csv(file_path)

        # Fail with a clear message instead of an opaque KeyError.
        missing = [col for col in selected_features if col not in df.columns]
        if missing:
            return JsonResponse(
                {"error": f"文件缺少必要的列: {', '.join(missing)}"}, status=400)

        df.dropna(subset=selected_features, inplace=True)

        # The silhouette score needs at least one more sample than clusters.
        n_samples = len(df)
        if n_samples <= final_k:
            return JsonResponse({"error": "有效样本数量不足,无法进行聚类分析"}, status=400)

        # Standardize, then project onto two principal components.
        scaler = StandardScaler()
        df_normalized = scaler.fit_transform(df[selected_features])
        df_pca = PCA(n_components=2).fit_transform(df_normalized)

        # Stage 1: try k = 2..10 (capped at n_samples - 1) and keep the k with
        # the best silhouette score; ties resolve to the smallest k.
        scores = {}
        for k in range(2, min(10, n_samples - 1) + 1):
            labels = KMeans(n_clusters=k, random_state=42).fit_predict(df_pca)
            scores[k] = silhouette_score(df_pca, labels)
        best_k_first_stage = max(scores, key=scores.get)

        kmeans_first_stage = KMeans(n_clusters=best_k_first_stage, random_state=42)
        clusters_first_stage = kmeans_first_stage.fit_predict(df_pca)

        # Stage 2: feed the first-stage labels in as an additional feature and
        # re-standardize everything together.
        df['Cluster_First_Stage'] = clusters_first_stage
        all_features = selected_features + ['Cluster_First_Stage']
        df_all_normalized = scaler.fit_transform(df[all_features])

        kmeans_final = KMeans(n_clusters=final_k, random_state=42)
        clusters_final = kmeans_final.fit_predict(df_all_normalized)
        final_silhouette_score = silhouette_score(df_all_normalized, clusters_final)

        # Persist the result for get_cluster_centers; .tolist() turns the
        # NumPy arrays into plain lists the session can serialize.
        request.session['cluster_centers'] = kmeans_final.cluster_centers_.tolist()
        request.session['clusters_final'] = clusters_final.tolist()

        return JsonResponse({
            "message": "聚类分析已完成。",
            "final_silhouette_score": float(final_silhouette_score),
        })
    except FileNotFoundError:
        return JsonResponse({"error": f"无法找到文件 {filename}"}, status=404)
    except Exception as e:
        # View boundary: report the failure to the client instead of crashing.
        return JsonResponse({"error": str(e)}, status=500)
def get_cluster_centers(request):
    """Return the clustering result stored in the current session.

    Reads ``cluster_centers`` and ``clusters_final`` from ``request.session``.

    Returns:
        JsonResponse with ``cluster_centers`` and ``final_clusters`` on
        success; HTTP 400 for a non-POST request or when either session value
        is missing; 500 if reading the session raises.
    """
    if request.method != 'POST':
        return JsonResponse({"error": "无效请求"}, status=400)

    try:
        session = request.session
        centers = session.get('cluster_centers')
        labels = session.get('clusters_final')

        # Both values must be present before there is anything to report.
        if any(value is None for value in (centers, labels)):
            return JsonResponse({"error": "请先执行聚类分析"}, status=400)

        return JsonResponse({
            "cluster_centers": centers,
            "final_clusters": labels,
        })
    except Exception as exc:
        return JsonResponse({"error": str(exc)}, status=500)
# urls.py
from django.urls import path
from home import views
# URL routes of the app. Each route name matches the name used by the
# template's {% url %} tags.
urlpatterns = [
    path('', views.home, name='home'),  # main page
    path('upload_file/', views.upload_file, name='upload_file'),
    path('preprocess_data/', views.preprocess_data, name='preprocess_data'),
    path('perform_clustering/', views.perform_clustering, name='perform_clustering'),
    path('get_cluster_centers/', views.get_cluster_centers, name='get_cluster_centers'),
    # Disabled alternative (hyphenated) route for the same view:
    # path('get-cluster-centers/', views.get_cluster_centers, name='get_cluster_centers'),
]