머신러닝 - K-최근접 이웃(KNN classifier)을 이용한 분류
KNN 2진분류 (Binary Classification)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
데이터 준비
# Load the iris dataset through seaborn's dataset loader.
df = sns.load_dataset('iris')
# Notebook-style bare expression: shows the (rows, columns) of the frame.
df.shape
(150, 5)
df.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
# Count how many rows belong to each label (species).
df['species'].value_counts()
setosa 50 versicolor 50 virginica 50 Name: species, dtype: int64
머신러닝 분류를 위해 label 값을 정수로 변환
# 라벨 인코딩
df['species'] = df['species'].map({
'setosa': 0,
'versicolor': 1,
'virginica': 2
})
df.head()
sepal_length | sepal_width | petal_length | petal_width | species | |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | 0 |
1 | 4.9 | 3.0 | 1.4 | 0.2 | 0 |
2 | 4.7 | 3.2 | 1.3 | 0.2 | 0 |
3 | 4.6 | 3.1 | 1.5 | 0.2 | 0 |
4 | 5.0 | 3.6 | 1.4 | 0.2 | 0 |
학습데이터(X)와 Label(y) 분리
# Split the frame into the feature matrix X and the label vector y.
# Alternative spellings of the same split, kept for reference:
# X, y = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], df['species']
# X, y = df.iloc[:, :-1], df.iloc[:, -1]
X, y = df.drop('species', axis=1), df['species']
X.head()
sepal_length | sepal_width | petal_length | petal_width | |
---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 |
1 | 4.9 | 3.0 | 1.4 | 0.2 |
2 | 4.7 | 3.2 | 1.3 | 0.2 |
3 | 4.6 | 3.1 | 1.5 | 0.2 |
4 | 5.0 | 3.6 | 1.4 | 0.2 |
# Bare expression: display the encoded label Series.
y
0 0 1 0 2 0 3 0 4 0 .. 145 2 146 2 147 2 148 2 149 2 Name: species, Length: 150, dtype: int64
테스트 데이터 분리
# Install scikit-learn if it is not available yet:
# !pip install scikit-learn
import sklearn
# Manual split, step 1: shuffle X and y together so the rows stay aligned.
# No random_state is passed, so the order is not pinned to a seed here.
X, y = sklearn.utils.shuffle(X, y)
# Manual split, step 2: first 80% of the rows for training, the rest for testing.
num = int(len(X)*0.8)
X_train, y_train = X.iloc[:num], y.iloc[:num]
X_test, y_test = X.iloc[num:], y.iloc[num:]
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
# Same 80/20 split using scikit-learn's helper; this reassigns the four
# variables produced by the manual split above.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,) (30, 4) (30,) (120, 4) (120,) (30, 4) (30,)
# Summary statistics of the training features, transposed so each feature is a row.
X_train.describe().T
count | mean | std | min | 25% | 50% | 75% | max | |
---|---|---|---|---|---|---|---|---|
sepal_length | 120.0 | 5.862500 | 0.835862 | 4.3 | 5.100 | 5.8 | 6.400 | 7.9 |
sepal_width | 120.0 | 3.036667 | 0.444563 | 2.0 | 2.800 | 3.0 | 3.300 | 4.4 |
petal_length | 120.0 | 3.811667 | 1.783358 | 1.1 | 1.575 | 4.4 | 5.125 | 6.9 |
petal_width | 120.0 | 1.216667 | 0.756966 | 0.1 | 0.300 | 1.3 | 1.800 | 2.5 |
정규화
- 표준화
- 최소-최대 정규화
1. 표준화
# Hand-rolled version, kept for reference. Note that pandas' .std() defaults to
# ddof=1 while StandardScaler uses ddof=0, so the two results differ slightly.
# mu = X_train.mean()
# std = X_train.std()
# X_train = (X_train - mu) / std
# X_test = (X_test - mu) / std
# scikit-learn version: learn mean/std from the training split only, then
# apply that same transform to both splits.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)
데이터 함수 정리
def get_iris():
    """Load seaborn's iris data and return a standardized 80/20 split.

    Species names are encoded as integers (setosa=0, versicolor=1,
    virginica=2), the rows are shuffled, the first 80% become the training
    split, and both splits are standardized with the mean and standard
    deviation of the training split.

    Returns:
        X_train, X_test, y_train, y_test
    """
    frame = sns.load_dataset('iris')

    # Label encoding
    class_ids = {'setosa': 0, 'versicolor': 1, 'virginica': 2}
    frame['species'] = frame['species'].map(class_ids)

    # Separate the features from the labels, then shuffle both together
    features = frame.drop('species', axis=1)
    labels = frame['species']
    features, labels = sklearn.utils.shuffle(features, labels)

    # train/test split (8:2)
    cut = int(len(features) * 0.8)
    X_train, X_test = features.iloc[:cut], features.iloc[cut:]
    y_train, y_test = labels.iloc[:cut], labels.iloc[cut:]

    # Standardize using training-split statistics only
    center = X_train.mean()
    spread = X_train.std()
    return (X_train - center) / spread, (X_test - center) / spread, y_train, y_test
# Build the splits with the helper and confirm their shapes.
X_train, X_test, y_train, y_test = get_iris()
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,) (30, 4) (30,)
이진 분류용 데이터 준비
# Read the iris data from a local CSV file (relative path 'iris.csv').
iris = pd.read_csv('iris.csv')
iris.head()
Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
---|---|---|---|---|---|---|
0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
def get_iris(mode=None):
    """Return a standardized 80/20 train/test split built from the ``iris`` frame.

    Reads the module-level ``iris`` DataFrame, drops its ``Id`` column,
    encodes the species as integers (Iris-setosa=0, Iris-versicolor=1,
    Iris-virginica=2), shuffles with a fixed seed (2022) and standardizes
    both splits with the training split's mean and standard deviation.

    Args:
        mode: Pass ``'bin'`` to drop the Iris-virginica rows and obtain a
            two-class problem. Any other value keeps all three classes.

    Returns:
        X_train, X_test, y_train, y_test
    """
    df = iris.drop('Id', axis=1).copy()
    if mode == 'bin':
        # Take an explicit copy of the filtered rows: assigning a column on
        # the result of a boolean selection can raise pandas'
        # SettingWithCopyWarning.
        df = df[df['Species'] != 'Iris-virginica'].copy()
    df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    df['species'] = df['species'].map({
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2
    })
    X, y = df.drop('species', axis=1), df['species']
    # Fixed seed so the split is the same on every call.
    X, y = sklearn.utils.shuffle(X, y, random_state=2022)
    num = int(len(X)*0.8)
    X_train, y_train = X.iloc[:num], y.iloc[:num]
    X_test, y_test = X.iloc[num:], y.iloc[num:]
    # Standardize both splits with statistics from the training split only.
    mu = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mu) / std
    X_test = (X_test - mu) / std
    return X_train, X_test, y_train, y_test
# Binary problem: 'bin' drops the Iris-virginica rows, leaving classes 0 and 1.
X_train, X_test, y_train, y_test = get_iris('bin')
print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(80, 4) (80,) (20, 4) (20,)
# Convert every split from a pandas object to its underlying NumPy array.
X_train, y_train, X_test, y_test = (
    split.values for split in (X_train, y_train, X_test, y_test)
)
모델 학습 (KNN)
from sklearn.neighbors import KNeighborsClassifier
# k-nearest-neighbours classifier with its default settings (no arguments passed).
clf = KNeighborsClassifier()
# Train
clf.fit(X_train, y_train)
# Evaluate: mean accuracy on the training split and on the test split.
clf.score(X_train, y_train), clf.score(X_test, y_test)
(1.0, 1.0)
# Predict the class of every test sample.
y_pred = clf.predict(X_test)
y_pred
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1], dtype=int64)
# Ground-truth labels, shown for comparison with y_pred.
y_test
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1], dtype=int64)
y_pred와 y_test를 비교하여 정확도 계산
# Manual implementation: the fraction of predictions that equal the true label.
acc1 = (y_pred == y_test).mean()
# scikit-learn implementation
from sklearn.metrics import accuracy_score, precision_score, recall_score  # accuracy, precision, recall
acc2 = accuracy_score(y_test, y_pred)
pre, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
print(acc1, acc2)
print(pre)
print(rec)
1.0 1.0 1.0 1.0
def print_score(y_true, y_pred):
    """Print accuracy, precision and recall of ``y_pred`` against ``y_true``.

    Each metric is printed on its own line as ``<name>: <value>``. The
    metric functions are called with their default arguments.
    """
    metrics = (
        ('accuracy:', accuracy_score),
        ('precision:', precision_score),
        ('recall:', recall_score),
    )
    # Compute every score before printing so nothing is emitted if one fails.
    scores = [(label, metric(y_true, y_pred)) for label, metric in metrics]
    for label, value in scores:
        print(label, value)
# Report all three metrics for the test predictions.
print_score(y_test, y_pred)
accuracy: 1.0 precision: 1.0 recall: 1.0
혼동행렬 (Confusion Matrix)
from sklearn.metrics import confusion_matrix
# Rows are the true classes, columns the predicted classes.
cfm = confusion_matrix(y_test, y_pred)
cfm
array([[13, 0], [ 0, 7]], dtype=int64)
# Visualization: draw the confusion matrix as a heatmap with annotated cells.
sns.heatmap(cfm, annot=True)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()
댓글남기기