
KNN Binary Classification


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Preparing the Data

# use the iris dataset
df = sns.load_dataset('iris')
df.shape
(150, 5)
df.head()
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
# check the count of each label (target) value
df['species'].value_counts()
setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64


Convert the label values to integers for machine-learning classification

# label encoding
df['species'] = df['species'].map({
    'setosa': 0,
    'versicolor': 1,
    'virginica': 2
})
df.head()
   sepal_length  sepal_width  petal_length  petal_width  species
0           5.1          3.5           1.4          0.2        0
1           4.9          3.0           1.4          0.2        0
2           4.7          3.2           1.3          0.2        0
3           4.6          3.1           1.5          0.2        0
4           5.0          3.6           1.4          0.2        0
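
As an aside, the same integer encoding can be produced without hard-coding the dictionary. A minimal sketch using scikit-learn's LabelEncoder, which is equivalent here because it assigns codes in alphabetical order (setosa=0, versicolor=1, virginica=2); it would be applied to the original string labels:

# hypothetical alternative to the manual map, not used below
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])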


Separating the features (X) from the labels (y)

# X, y = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], df['species']
# X, y = df.iloc[:, :-1], df.iloc[:, -1]

X, y = df.drop('species', axis=1), df['species']
X.head()
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
y
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int64

Splitting off the test data

# install scikit-learn
# !pip install scikit-learn
import sklearn

# shuffle the data before splitting
X, y = sklearn.utils.shuffle(X, y)

# 80:20 train:test split
num = int(len(X)*0.8)

X_train, y_train = X.iloc[:num], y.iloc[:num]
X_test, y_test = X.iloc[num:], y.iloc[num:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# the scikit-learn helper does the shuffle and split in one call
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,)
(30, 4) (30,)
(120, 4) (120,)
(30, 4) (30,)
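As an aside, with only 150 rows a random split can leave the class proportions slightly unbalanced between train and test. train_test_split accepts a stratify argument that preserves the per-class ratios; a minimal sketch (the rest of this post keeps the plain split above):

# stratified variant: each species keeps its 1/3 share in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022)
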
X_train.describe().T
              count      mean       std  min    25%  50%    75%  max
sepal_length  120.0  5.862500  0.835862  4.3  5.100  5.8  6.400  7.9
sepal_width   120.0  3.036667  0.444563  2.0  2.800  3.0  3.300  4.4
petal_length  120.0  3.811667  1.783358  1.1  1.575  4.4  5.125  6.9
petal_width   120.0  1.216667  0.756966  0.1  0.300  1.3  1.800  2.5

Normalization

The describe() output above shows the features on quite different scales (petal_length's std is roughly four times sepal_width's), which matters for a distance-based model like KNN. Two common rescaling options:

  • Standardization

  • Min-max normalization

1. Standardization

# manual implementation
# mu = X_train.mean()
# std = X_train.std()
# X_train = (X_train - mu) / std
# X_test = (X_test - mu) / std

# using the scikit-learn helper
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
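
Note that the scaler is fit on X_train only and then applied to both sets; fitting on the test data would leak its statistics into the preprocessing. A quick sanity check (transform returns NumPy arrays, so axis=0 works directly):

# after standardization the training features have mean ~0 and std 1
print(X_train.mean(axis=0).round(6))
print(X_train.std(axis=0).round(6))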


Wrapping the data preparation in a function

def get_iris():
    df = sns.load_dataset('iris')
    
    # label encoding
    df['species'] = df['species'].map({
        'setosa': 0,
        'versicolor': 1,
        'virginica': 2
    })
    
    # separate the features (X) from the labels (y)
    X, y = df.drop('species', axis=1), df['species']
    
    # shuffle, then split 80:20 into train and test
    X, y = sklearn.utils.shuffle(X, y)
    
    num = int(len(X)*0.8)
    
    X_train, y_train = X.iloc[:num], y.iloc[:num]
    X_test, y_test = X.iloc[num:], y.iloc[num:]
    
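    # standardize with training-set statistics only (note: pandas .std() uses
    # ddof=1, while StandardScaler uses ddof=0, so the results differ slightly)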
    mu = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mu) / std
    X_test = (X_test - mu) / std
    
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_iris()

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,)
(30, 4) (30,)

Preparing the binary data

To turn this into a binary problem, we reload the data and keep only two of the three species: get_iris below drops Iris-virginica when called with mode='bin'.


iris = pd.read_csv('iris.csv')
iris.head()
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
def get_iris(mode=None):
    df = iris.drop('Id', axis=1).copy()
    
    if mode == 'bin':
        df = df[df['Species'] != 'Iris-virginica']
    
    df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    df['species'] = df['species'].map({
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2
    })

    X, y = df.drop('species', axis=1), df['species']

    X, y = sklearn.utils.shuffle(X, y, random_state=2022)

    num = int(len(X)*0.8)

    X_train, y_train = X.iloc[:num], y.iloc[:num]
    X_test, y_test = X.iloc[num:], y.iloc[num:]

    mu = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mu) / std
    X_test = (X_test - mu) / std
    
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_iris('bin')

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(80, 4) (80,)
(20, 4) (20,)
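# convert to NumPy arrays (optional; scikit-learn estimators also accept DataFrames)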
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

Model Training (KNN)

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

# train the model
clf.fit(X_train, y_train)
# evaluate (mean accuracy on train and test)
clf.score(X_train, y_train), clf.score(X_test, y_test)
(1.0, 1.0)
# predict on the test set
y_pred = clf.predict(X_test)
y_pred
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)
y_test
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)
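
KNeighborsClassifier uses n_neighbors=5 by default, and setosa and versicolor are easily separable, so the perfect scores above are not surprising. A minimal sketch for comparing a few values of k (hypothetical loop, not part of the original run):

# try several neighborhood sizes and compare test accuracy
for k in (1, 3, 5, 7, 9):
    clf_k = KNeighborsClassifier(n_neighbors=k)
    clf_k.fit(X_train, y_train)
    print(k, clf_k.score(X_test, y_test))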

Computing accuracy from y_pred and y_test

# manual implementation
acc1 = (y_pred == y_test).sum() / len(y_test)

# using sklearn helpers
from sklearn.metrics import accuracy_score, precision_score, recall_score  # accuracy, precision, recall

acc2 = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

print(acc1, acc2)
print(pre)
print(rec)
1.0 1.0
1.0
1.0
def print_score(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)

    print('accuracy:', acc)
    print('precision:', pre)
    print('recall:', rec)
print_score(y_test, y_pred)
accuracy: 1.0
precision: 1.0
recall: 1.0
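
scikit-learn can also report all of these metrics (plus F1) per class in one call; a minimal sketch with classification_report:

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))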

Confusion Matrix

from sklearn.metrics import confusion_matrix

cfm = confusion_matrix(y_test, y_pred)
cfm
array([[13,  0],
       [ 0,  7]], dtype=int64)
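
With scikit-learn's ordering, row i is the true class and column j the predicted class, so the binary matrix unpacks as TN, FP, FN, TP; from these, precision = TP / (TP + FP) and recall = TP / (TP + FN) can be read off directly:

# rows = true class, columns = predicted class
tn, fp, fn, tp = cfm.ravel()
print(tn, fp, fn, tp)  # 13 0 0 7 for the matrix above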
# visualization
sns.heatmap(cfm, annot=True)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()
