
KNN Binary Classification


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Preparing the Data

# use the iris dataset
df = sns.load_dataset('iris')
df.shape
(150, 5)
df.head()
   sepal_length  sepal_width  petal_length  petal_width species
0           5.1          3.5           1.4          0.2  setosa
1           4.9          3.0           1.4          0.2  setosa
2           4.7          3.2           1.3          0.2  setosa
3           4.6          3.1           1.5          0.2  setosa
4           5.0          3.6           1.4          0.2  setosa
# check the count of each label (target) value
df['species'].value_counts()
setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64


Convert the label values to integers for machine-learning classification

# label encoding
df['species'] = df['species'].map({
    'setosa': 0,
    'versicolor': 1,
    'virginica': 2
})
df.head()
   sepal_length  sepal_width  petal_length  petal_width  species
0           5.1          3.5           1.4          0.2        0
1           4.9          3.0           1.4          0.2        0
2           4.7          3.2           1.3          0.2        0
3           4.6          3.1           1.5          0.2        0
4           5.0          3.6           1.4          0.2        0
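
As an aside, the same integer encoding can be produced without hard-coding the dictionary. A minimal sketch using scikit-learn's LabelEncoder, which is equivalent here because it assigns codes in alphabetical order (setosa=0, versicolor=1, virginica=2); it would be applied to the original string labels:

# hypothetical alternative to the manual map, not used below
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])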


Separating the features (X) from the labels (y)

# X, y = df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']], df['species']
# X, y = df.iloc[:, :-1], df.iloc[:, -1]

X, y = df.drop('species', axis=1), df['species']
X.head()
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2
y
0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: species, Length: 150, dtype: int64

Splitting off the test data

# install scikit-learn
# !pip install scikit-learn
import sklearn

# shuffle the data before splitting
X, y = sklearn.utils.shuffle(X, y)

# 80:20 train:test split
num = int(len(X)*0.8)

X_train, y_train = X.iloc[:num], y.iloc[:num]
X_test, y_test = X.iloc[num:], y.iloc[num:]

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)

# the scikit-learn helper does the shuffle and split in one call
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2022)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,)
(30, 4) (30,)
(120, 4) (120,)
(30, 4) (30,)
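As an aside, with only 150 rows a random split can leave the class proportions slightly unbalanced between train and test. train_test_split accepts a stratify argument that preserves the per-class ratios; a minimal sketch (the rest of this post keeps the plain split above):

# stratified variant: each species keeps its 1/3 share in both subsets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=2022)
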
X_train.describe().T
              count      mean       std  min    25%  50%    75%  max
sepal_length  120.0  5.862500  0.835862  4.3  5.100  5.8  6.400  7.9
sepal_width   120.0  3.036667  0.444563  2.0  2.800  3.0  3.300  4.4
petal_length  120.0  3.811667  1.783358  1.1  1.575  4.4  5.125  6.9
petal_width   120.0  1.216667  0.756966  0.1  0.300  1.3  1.800  2.5

Normalization

The describe() output above shows the features on quite different scales (petal_length's std is roughly four times sepal_width's), which matters for a distance-based model like KNN. Two common rescaling options:

  • Standardization

  • Min-max normalization

1. Standardization

# manual implementation
# mu = X_train.mean()
# std = X_train.std()
# X_train = (X_train - mu) / std
# X_test = (X_test - mu) / std

# using the scikit-learn helper
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
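
Note that the scaler is fit on X_train only and then applied to both sets; fitting on the test data would leak its statistics into the preprocessing. A quick sanity check (transform returns NumPy arrays, so axis=0 works directly):

# after standardization the training features have mean ~0 and std 1
print(X_train.mean(axis=0).round(6))
print(X_train.std(axis=0).round(6))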


Wrapping the data preparation in a function

def get_iris():
    df = sns.load_dataset('iris')
    
    # label encoding
    df['species'] = df['species'].map({
        'setosa': 0,
        'versicolor': 1,
        'virginica': 2
    })
    
    # separate the features (X) from the labels (y)
    X, y = df.drop('species', axis=1), df['species']
    
    # shuffle, then split 80:20 into train and test
    X, y = sklearn.utils.shuffle(X, y)
    
    num = int(len(X)*0.8)
    
    X_train, y_train = X.iloc[:num], y.iloc[:num]
    X_test, y_test = X.iloc[num:], y.iloc[num:]
    
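    # standardize with training-set statistics only (note: pandas .std() uses
    # ddof=1, while StandardScaler uses ddof=0, so the results differ slightly)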
    mu = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mu) / std
    X_test = (X_test - mu) / std
    
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_iris()

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(120, 4) (120,)
(30, 4) (30,)

Preparing the binary data

To turn this into a binary problem, we reload the data and keep only two of the three species: get_iris below drops Iris-virginica when called with mode='bin'.


iris = pd.read_csv('iris.csv')
iris.head()
   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm      Species
0   1            5.1           3.5            1.4           0.2  Iris-setosa
1   2            4.9           3.0            1.4           0.2  Iris-setosa
2   3            4.7           3.2            1.3           0.2  Iris-setosa
3   4            4.6           3.1            1.5           0.2  Iris-setosa
4   5            5.0           3.6            1.4           0.2  Iris-setosa
def get_iris(mode=None):
    df = iris.drop('Id', axis=1).copy()
    
    if mode == 'bin':
        df = df[df['Species'] != 'Iris-virginica']
    
    df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
    df['species'] = df['species'].map({
        'Iris-setosa': 0,
        'Iris-versicolor': 1,
        'Iris-virginica': 2
    })

    X, y = df.drop('species', axis=1), df['species']

    X, y = sklearn.utils.shuffle(X, y, random_state=2022)

    num = int(len(X)*0.8)

    X_train, y_train = X.iloc[:num], y.iloc[:num]
    X_test, y_test = X.iloc[num:], y.iloc[num:]

    mu = X_train.mean()
    std = X_train.std()
    X_train = (X_train - mu) / std
    X_test = (X_test - mu) / std
    
    return X_train, X_test, y_train, y_test
X_train, X_test, y_train, y_test = get_iris('bin')

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
(80, 4) (80,)
(20, 4) (20,)
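# convert to NumPy arrays (optional; scikit-learn estimators also accept DataFrames)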
X_train = X_train.values
y_train = y_train.values
X_test = X_test.values
y_test = y_test.values

Model Training (KNN)

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier()

# train the model
clf.fit(X_train, y_train)
# evaluate (mean accuracy on train and test)
clf.score(X_train, y_train), clf.score(X_test, y_test)
(1.0, 1.0)
# predict on the test set
y_pred = clf.predict(X_test)
y_pred
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)
y_test
array([0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1],
      dtype=int64)
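
KNeighborsClassifier uses n_neighbors=5 by default, and setosa and versicolor are easily separable, so the perfect scores above are not surprising. A minimal sketch for comparing a few values of k (hypothetical loop, not part of the original run):

# try several neighborhood sizes and compare test accuracy
for k in (1, 3, 5, 7, 9):
    clf_k = KNeighborsClassifier(n_neighbors=k)
    clf_k.fit(X_train, y_train)
    print(k, clf_k.score(X_test, y_test))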

Computing accuracy from y_pred and y_test

# manual implementation
acc1 = (y_pred == y_test).sum() / len(y_test)

# using sklearn helpers
from sklearn.metrics import accuracy_score, precision_score, recall_score  # accuracy, precision, recall

acc2 = accuracy_score(y_test, y_pred)
pre = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

print(acc1, acc2)
print(pre)
print(rec)
1.0 1.0
1.0
1.0
def print_score(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    pre = precision_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)

    print('accuracy:', acc)
    print('precision:', pre)
    print('recall:', rec)
print_score(y_test, y_pred)
accuracy: 1.0
precision: 1.0
recall: 1.0
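
scikit-learn can also report all of these metrics (plus F1) per class in one call; a minimal sketch with classification_report:

from sklearn.metrics import classification_report

print(classification_report(y_test, y_pred))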

Confusion Matrix

from sklearn.metrics import confusion_matrix

cfm = confusion_matrix(y_test, y_pred)
cfm
array([[13,  0],
       [ 0,  7]], dtype=int64)
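
With scikit-learn's ordering, row i is the true class and column j the predicted class, so the binary matrix unpacks as TN, FP, FN, TP; from these, precision = TP / (TP + FP) and recall = TP / (TP + FN) can be read off directly:

# rows = true class, columns = predicted class
tn, fp, fn, tp = cfm.ravel()
print(tn, fp, fn, tp)  # 13 0 0 7 for the matrix above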
# visualization
sns.heatmap(cfm, annot=True)
plt.xlabel('Predicted Class')
plt.ylabel('True Class')
plt.show()
