import numpy as np
import pandas as pd
from sklearn. datasets import load_iris
from sklearn. model_selection import train_test_split
from sklearn. metrics import accuracy_score
1. 数据预处理
# Load the iris dataset and build a DataFrame with one column per feature
# plus a human-readable 'class' column.
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

# Translate the integer targets 0, 1, 2 into their species names.
class_names = {code: iris.target_names[code] for code in range(3)}
df['class'] = pd.Series(iris.target, index=df.index).map(class_names)

# Notebook-style preview of the first rows.
df.head()
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)   class
0                5.1               3.5                1.4               0.2  setosa
1                4.9               3.0                1.4               0.2  setosa
2                4.7               3.2                1.3               0.2  setosa
3                4.6               3.1                1.5               0.2  setosa
4                5.0               3.6                1.4               0.2  setosa
# Feature matrix, and the targets reshaped into a single column.
x = iris.data
y = np.reshape(iris.target, (-1, 1))

# Report the shape of both arrays.
for label, array in (("x shape: ", x), ("y shape: ", y)):
    print(label, array.shape)
x shape: (150, 4)
y shape: (150, 1)
# Hold out 30% of the samples for testing. Stratifying on y keeps the class
# proportions the same in both parts; the fixed seed makes the split
# repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)
2. 模型实现
def l1_distance(a, b):
    """Return the Manhattan (L1) distance from each row of *a* to *b*.

    The absolute element-wise differences are summed along axis 1, so the
    result has one entry per row of ``a - b``.
    """
    abs_diff = np.abs(a - b)
    return abs_diff.sum(axis=1)
def l2_distance(a, b):
    """Return the Euclidean (L2) distance from each row of *a* to *b*.

    The squared element-wise differences are summed along axis 1 and the
    square root of each sum is returned.
    """
    delta = a - b
    squared_sum = (delta ** 2).sum(axis=1)
    return np.sqrt(squared_sum)
class KnnModel:
    """k-nearest-neighbours classifier with a pluggable distance function.

    ``k_neighbors`` and ``distance_func`` are plain attributes and may be
    reassigned between calls to :meth:`predict`.
    """

    def __init__(self, k_neighbors=1, distance_func=l1_distance):
        """Store the hyper-parameters.

        Args:
            k_neighbors: number of nearest training samples that vote.
            distance_func: callable ``f(train_matrix, sample)`` returning one
                distance per training row.
        """
        self.k_neighbors = k_neighbors
        self.distance_func = distance_func

    def fit(self, x, y):
        """Memorise the training samples *x* and their labels *y*.

        Both are converted with ``np.asarray`` so that plain array-likes
        (e.g. nested lists) are accepted as well as ndarrays.
        """
        self.x_train = np.asarray(x)
        self.y_train = np.asarray(y)

    def predict(self, test):
        """Predict a label for every row of *test*.

        Labels are tallied with ``np.bincount``, so they must be
        non-negative integers; when several labels tie, the smallest one
        wins.

        Returns:
            An array of shape ``(n_samples, 1)`` with the dtype of the
            training labels.

        Raises:
            ValueError: if ``k_neighbors`` is smaller than 1.
        """
        # A non-positive k would slice from the wrong end of the sorted
        # indices (or leave no voters at all), so reject it explicitly.
        if self.k_neighbors < 1:
            raise ValueError(
                "k_neighbors must be at least 1, got %r" % (self.k_neighbors,)
            )

        test = np.asarray(test)
        y_predict = np.zeros((test.shape[0], 1), dtype=self.y_train.dtype)
        for i, sample in enumerate(test):
            distances = self.distance_func(self.x_train, sample)
            nearest = np.argsort(distances)[:self.k_neighbors]
            votes = self.y_train[nearest].ravel()
            # bincount index == label, so argmax gives the majority label.
            y_predict[i] = np.argmax(np.bincount(votes))
        return y_predict
3. 测试
# Measure test accuracy for each distance metric and each odd k in 1..19.
knn = KnnModel(k_neighbors=9)
knn.fit(x_train, y_train)

result_list = []
# Each label is paired with its function so that the label recorded in the
# results always names the metric that was actually used for the prediction.
for dist_name, dist_func in [('l1_dist', l1_distance), ('l2_dist', l2_distance)]:
    knn.distance_func = dist_func
    for k in range(1, 20, 2):
        knn.k_neighbors = k
        y_predict = knn.predict(x_test)
        # Accuracy as a percentage.
        acc = accuracy_score(y_test, y_predict) * 100
        result_list.append([k, dist_name, acc])

result_df = pd.DataFrame(result_list, columns=['k', '距离函数', '准确率'])
print(result_df)
k 距离函数 准确率
0 1 l1_dist 93.333333
1 3 l1_dist 95.555556
2 5 l1_dist 97.777778
3 7 l1_dist 95.555556
4 9 l1_dist 95.555556
5 11 l1_dist 93.333333
6 13 l1_dist 93.333333
7 15 l1_dist 95.555556
8 17 l1_dist 95.555556
9 19 l1_dist 95.555556
10 1 l2_dist 93.333333
11 3 l2_dist 95.555556
12 5 l2_dist 97.777778
13 7 l2_dist 95.555556
14 9 l2_dist 95.555556
15 11 l2_dist 93.333333
16 13 l2_dist 93.333333
17 15 l2_dist 95.555556
18 17 l2_dist 95.555556
19 19 l2_dist 95.555556