import pandas as pd
# Load the diabetes table; the first CSV column is used as the row index.
data = pd.read_csv("../data/第2章数据/diabetes.csv", index_col=0)
Index = data.columns

# Split the column labels into two groups by the substring they contain.
xtitle = [name for name in Index if 'x.' in name]
x2title = [name for name in Index if 'x2.' in name]

# Response column, then the two predictor sub-frames selected above.
ydata = data['y']
xdata = data[xtitle]
x2data = data[x2title]
import statsmodels.api as sm
import matplotlib.pyplot as plt
import scipy
# Explicit submodule import: a plain `import scipy` may not load `scipy.stats`
# on older SciPy releases.
import scipy.stats

# Ordinary least squares of y on the x2 predictors with a constant column prepended.
X = sm.add_constant(x2data, prepend=True)
lm = sm.OLS(ydata, X)
lm_result = lm.fit()  # inspect with dir(lm_result) or lm_result.summary()
y_hat = lm_result.fittedvalues
res = lm_result.resid

# Scatter plot of residuals against fitted values.
plt.figure()
plt.plot(y_hat, res, '.k')
# Wrong way -- assignment rebinds the pyplot attributes instead of calling them:
#     plt.xlabel = 'yhat'
#     plt.ylabel = 'residuals'
#     plt.title = 'residuals vs yhat'
# Right way -- call the functions:
plt.xlabel('yhat')  # Set x-axis label
plt.ylabel('residuals')  # Set y-axis label
plt.title('residuals vs yhat')  # Set title
plt.show()

# Shapiro-Wilk test applied to the residuals.
W, p_value = scipy.stats.shapiro(res)
print(W, p_value)
# Recorded output: (0.9937732815742493, 0.06650751084089279)
# Example 2.2
import numpy as np
def kappa(x):
    """Return the condition number of the real-valued matrix *x*.

    The value is ``sqrt(lam_max / lam_min)``, where ``lam_max`` and
    ``lam_min`` are the largest and smallest eigenvalues of ``x.T @ x``.

    Parameters
    ----------
    x : array-like
        Two-dimensional data; anything ``np.array`` can convert.

    Returns
    -------
    float
        The square root of the eigenvalue ratio.
    """
    x = np.array(x)
    XX = np.dot(x.T, x)
    # XX is symmetric, so the symmetric eigenvalue routine applies and always
    # returns real eigenvalues.
    lam = np.linalg.eigvalsh(XX)
    return np.sqrt(lam.max() / lam.min())
# Condition number of the x predictors; printed so it is visible when run as a script.
print(kappa(xdata))
# Recorded output: 21.68154463827331
import matplotlib.pyplot as plt
from sklearn import linear_model
# Ridge coefficient path: refit the model for each penalty on a log-spaced grid.
n_alphas = 200
alphas = np.logspace(-5, 3, n_alphas)
coefs = []
for a in alphas:
    ridge = linear_model.Ridge(alpha=a, fit_intercept=False)
    ridge.fit(xdata, ydata)
    coefs.append(ridge.coef_)

# Choose the penalty by cross-validation over 13 log-spaced candidates.
# NOTE(review): RidgeCV is constructed without fit_intercept=False, unlike the
# path models above -- confirm this difference is intended.
reg = linear_model.RidgeCV(alphas=np.logspace(-6, 6, 13))
reg.fit(xdata, ydata)
print(reg.alpha_)
# Recorded output: 0.01

# Plot Ridge coefficients
ax = plt.gca()
ax.plot(alphas, coefs, label=xdata.columns)
ax.set_xscale("log")  # x-axis in log scale
ax.set_xlim(ax.get_xlim())
plt.axvline(reg.alpha_, linestyle="--", color="black", label='alpha: CV estimate')
# The legend is created after the vertical line so that the line's label is listed too.
ax.legend(loc='upper right')
# Axis labels and title
plt.xlabel('alpha')
plt.ylabel('weights')
plt.title('Ridge coefficients as a function of the regularization')
plt.axis('tight')
plt.show()
# Example 2.3
from sklearn.linear_model import LassoCV
# Cross-validated Lasso with 4 folds.
# NOTE(review): this model is fit on X, whereas the coefficient path below is
# fit on xdata -- confirm which design matrix is intended.
lasso = linear_model.LassoCV(cv=4).fit(X, ydata)

# Lasso coefficient path: refit one estimator for each penalty on a log-spaced grid.
n_alphas = 20
alphas = np.logspace(-2, 1, n_alphas)
clf = linear_model.Lasso(fit_intercept=False)
coefs = []
for a in alphas:
    clf.set_params(alpha=a)
    clf.fit(xdata, ydata)
    coefs.append(clf.coef_)

# Plot Lasso coefficients
ax = plt.gca()
ax.plot(alphas, coefs, label=xdata.columns)
ax.set_xscale("log")  # x-axis in log scale
ax.set_xlim(ax.get_xlim())
# Mark the Lasso cross-validation estimate (previously the RidgeCV value was drawn here).
plt.axvline(lasso.alpha_, linestyle="--", color="black", label='alpha: CV estimate')
# The legend is created after the vertical line so that the line's label is listed too.
ax.legend(loc='upper right')
plt.xlabel('alpha')  # Set x-axis label
plt.ylabel('weights')  # Set y-axis label
plt.title('Lasso coefficients as a function of the regularization')
plt.axis('tight')
plt.show()