- 差异表达分析
- 基因共表达
- 聚类稳定性
import scanpy as sc
import pandas as pd
# --- Load data, preprocess, and run differential-expression analysis --------
# data.csv is read without a header; rows are treated as cells and columns as
# genes (see the naming below). pred_labels.csv supplies one cluster label per
# row -- presumably in the same row order as data.csv; verify against producer.
data_matrix = pd.read_csv("data.csv", header=None)
labels = pd.read_csv("pred_labels.csv", header=None, names=["cluster"])

# Synthetic, 1-based cell / gene identifiers.
data_matrix.index = [f"cell{i+1}" for i in range(data_matrix.shape[0])]
data_matrix.columns = [f"gene{i+1}" for i in range(data_matrix.shape[1])]

# Carry the identifiers into AnnData. Building it from `.values` alone drops
# them, leaving obs/var named by integer position ("0", "1", ...), so the DE
# table would list positions instead of the gene names assigned above.
labels.index = data_matrix.index
adata = sc.AnnData(
    X=data_matrix.values,
    obs=labels,
    var=pd.DataFrame(index=data_matrix.columns),
)
# rank_genes_groups expects a categorical grouping column.
adata.obs['cluster'] = adata.obs['cluster'].astype('category')

# Basic QC filtering, then library-size normalisation and log transform.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Wilcoxon rank-sum test of each cluster against the rest; group=None returns
# the results for all clusters in a single long-format DataFrame.
sc.tl.rank_genes_groups(adata, groupby="cluster", method="wilcoxon")
de_genes = sc.get.rank_genes_groups_df(adata, group=None)
de_genes.to_csv("DE_genes.csv")
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# --- Volcano plot of the differential-expression results --------------------
# A row counts as significant when its adjusted p-value is below 0.05 and its
# absolute log fold change exceeds 1.
passes_pval = de_genes["pvals_adj"] < 0.05
passes_lfc = de_genes["logfoldchanges"].abs() > 1
significant = de_genes[passes_pval & passes_lfc]

fig, ax = plt.subplots(figsize=(10, 6))
neg_log_padj = -np.log10(de_genes["pvals_adj"])
is_significant = de_genes.index.isin(significant.index)
sns.scatterplot(
    x=de_genes["logfoldchanges"],
    y=neg_log_padj,
    hue=is_significant,
    palette={True: "red", False: "gray"},
    alpha=0.6,
    ax=ax,
)
ax.set_xlabel("log2(Fold Change)")
ax.set_ylabel("-log10(Adjusted p-value)")
ax.set_title("DE Genes Identified by Clustering")
ax.legend(title="Significant", bbox_to_anchor=(1, 1))

# Write the figure to disk before showing it.
fig.savefig('01.tif', format='tif', dpi=600)
plt.show()
# --- Heatmap of the top differentially expressed genes across clusters ------
# The adjusted p-value column produced by rank_genes_groups_df is "pvals_adj",
# and gene identifiers live in the "names" column (the frame's index is just a
# row counter). A gene can appear once per cluster in the long-format table,
# so duplicates are dropped before taking the 20 best-ranked genes.
top_genes = (
    de_genes.sort_values("pvals_adj")["names"]
    .drop_duplicates()
    .head(20)
    .tolist()
)

# Mean expression of each selected gene per cluster -> genes x clusters.
subset = adata[:, top_genes].to_df()
subset["cluster"] = adata.obs["cluster"]
# observed=True skips categories that have no cells left after filtering.
subset = subset.groupby("cluster", observed=True).mean().T

# Row-wise z-score. A plain `df - series` aligns the series with the columns,
# which yields all-NaN here, so the row axis is given explicitly.
gene_mean = subset.mean(axis=1)
gene_std = subset.std(axis=1)
subset = subset.sub(gene_mean, axis=0).div(gene_std, axis=0)

plt.figure(figsize=(12, 8))
sns.heatmap(subset, cmap="viridis", annot=False, fmt=".2f")
plt.title("Top DE Genes Across Clusters")
plt.xlabel("Clusters")
plt.ylabel("Genes")
plt.show()
# --- Gene co-expression among the most variable genes -----------------------
# rowvar=False treats each column of adata.X (a gene) as a variable, so
# corr_matrix is the gene x gene Pearson correlation matrix.
corr_matrix = np.corrcoef(adata.X, rowvar=False)

# Pick the 50 genes with the largest variance (all genes if fewer than 50).
gene_variances = adata.X.var(axis=0)
high_var_genes = adata.var_names[np.argsort(gene_variances)[-50:]]

# Boolean mask over var_names; the sub-matrix keeps the genes in var order.
high_var_mask = adata.var_names.isin(high_var_genes)
subset_corr = corr_matrix[np.ix_(high_var_mask, high_var_mask)]

fig = plt.figure(figsize=(12, 8))
plt.imshow(subset_corr, cmap='coolwarm', interpolation='none')
plt.colorbar()
plt.title("Gene Co-expression Network")
# Saved as 02.tif: writing to '01.tif' here would overwrite the volcano plot
# produced earlier in the script.
fig.savefig('02.tif', format='tif', dpi=600)
plt.show()
from sklearn.metrics import silhouette_score
# --- Clustering quality: silhouette score and t-SNE embedding ---------------
# Mean silhouette coefficient of the cluster labels over the expression
# matrix currently stored in adata.X.
silhouette_avg = silhouette_score(adata.X, adata.obs['cluster'])
print(f"Silhouette Score: {silhouette_avg:.3f}")

sc.tl.tsne(adata)
# show=False keeps the figure open. With the default, scanpy displays the plot
# itself, and the plt.gcf() below may then return a new, empty figure, so the
# saved file would be blank.
sc.pl.tsne(
    adata,
    color='cluster',
    palette='tab10',
    title="Clustering Stability",
    show=False,
)
plt.gcf().savefig('03.tif', format='tif', dpi=600)
plt.show()