4 분 소요

데이터셋 준비

사이킷런에서 당뇨병 데이터셋을 받아옵니다.

그리고 데이터셋을 판다스의 데이터 프레임으로 변환합니다.

# NOTE(review): load_boston was removed in scikit-learn 1.2, so this import
# line fails on newer releases. Neither load_boston nor load_wine is used in
# the code shown here — confirm against the rest of the project before trimming.
from sklearn.datasets import load_diabetes, load_boston,load_wine

# Load the diabetes dataset and pull out the feature matrix and column names.
dataset = load_diabetes()
features = dataset['data']
feature_names = dataset['feature_names']
import pandas as pd

# Wrap the features in a DataFrame whose columns are the feature names,
# then preview the first rows.
df = pd.DataFrame(features, columns=feature_names)
df.head()
age sex bmi bp s1 s2 s3 s4 s5 s6
0 0.038076 0.050680 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163 0.074412 -0.039493 -0.068330 -0.092204
2 0.085299 0.050680 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592 0.002864 -0.025930
3 -0.089063 -0.044642 -0.011595 -0.036656 0.012191 0.024991 -0.036038 0.034309 0.022692 -0.009362
4 0.005383 -0.044642 -0.036385 0.021872 0.003935 0.015596 0.008142 -0.002592 -0.031991 -0.046641

데이터 정보 확인

describe(): 특징에 대한 개수(count), 평균(mean), 표준편차(std), 최소(min), 최대(max) 등을 알 수 있습니다.

info(): 특징의 null값의 유무와 Dtype을 알 수 있습니다.

nlargest(int): 값이 큰 순서대로 상위 int개의 값을 출력합니다.

corr(): 각 특징간 상관관계를 알려줍니다.

df.describe()
age sex bmi bp s1 s2 s3 s4 s5 s6
count 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02 4.420000e+02
mean -3.634285e-16 1.308343e-16 -8.045349e-16 1.281655e-16 -8.835316e-17 1.327024e-16 -4.574646e-16 3.777301e-16 -3.830854e-16 -3.412882e-16
std 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02 4.761905e-02
min -1.072256e-01 -4.464164e-02 -9.027530e-02 -1.123996e-01 -1.267807e-01 -1.156131e-01 -1.023071e-01 -7.639450e-02 -1.260974e-01 -1.377672e-01
25% -3.729927e-02 -4.464164e-02 -3.422907e-02 -3.665645e-02 -3.424784e-02 -3.035840e-02 -3.511716e-02 -3.949338e-02 -3.324879e-02 -3.317903e-02
50% 5.383060e-03 -4.464164e-02 -7.283766e-03 -5.670611e-03 -4.320866e-03 -3.819065e-03 -6.584468e-03 -2.592262e-03 -1.947634e-03 -1.077698e-03
75% 3.807591e-02 5.068012e-02 3.124802e-02 3.564384e-02 2.835801e-02 2.984439e-02 2.931150e-02 3.430886e-02 3.243323e-02 2.791705e-02
max 1.107267e-01 5.068012e-02 1.705552e-01 1.320442e-01 1.539137e-01 1.987880e-01 1.811791e-01 1.852344e-01 1.335990e-01 1.356118e-01
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 10 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   age     442 non-null    float64
 1   sex     442 non-null    float64
 2   bmi     442 non-null    float64
 3   bp      442 non-null    float64
 4   s1      442 non-null    float64
 5   s2      442 non-null    float64
 6   s3      442 non-null    float64
 7   s4      442 non-null    float64
 8   s5      442 non-null    float64
 9   s6      442 non-null    float64
dtypes: float64(10)
memory usage: 34.7 KB
df['bmi'].nlargest(3)
367    0.170555
256    0.160855
366    0.137143
Name: bmi, dtype: float64
df.corr()
age bmi bp s1 s2 s3 s4 s5 s6 label
age 1.000000 0.185085 0.335427 0.260061 0.219243 -0.075181 0.203841 0.270777 0.301731 0.006930
bmi 0.185085 1.000000 0.395415 0.249777 0.261170 -0.366811 0.413807 0.446159 0.388680 0.116859
bp 0.335427 0.395415 1.000000 0.242470 0.185558 -0.178761 0.257653 0.393478 0.390429 0.048823
s1 0.260061 0.249777 0.242470 1.000000 0.896663 0.051519 0.542207 0.515501 0.325717 0.008234
s2 0.219243 0.261170 0.185558 0.896663 1.000000 -0.196455 0.659817 0.318353 0.290600 0.043795
s3 -0.075181 -0.366811 -0.178761 0.051519 -0.196455 1.000000 -0.738493 -0.398577 -0.273697 -0.086296
s4 0.203841 0.413807 0.257653 0.542207 0.659817 -0.738493 1.000000 0.617857 0.417212 0.062967
s5 0.270777 0.446159 0.393478 0.515501 0.318353 -0.398577 0.617857 1.000000 0.464670 0.015286
s6 0.301731 0.388680 0.390429 0.325717 0.290600 -0.273697 0.417212 0.464670 1.000000 -0.004950
label 0.006930 0.116859 0.048823 0.008234 0.043795 -0.086296 0.062967 0.015286 -0.004950 1.000000

함수 적용

apply()

def sortsex(x):
  """Return 'male' when x is strictly positive, otherwise 'female'.

  Intended to be passed to ``Series.apply`` to turn the numeric ``sex``
  column into a two-valued string label.
  """
  return 'male' if x > 0 else 'female'
df['sex'] = df['sex'].apply(sortsex)
df['sex']
0        male
1      female
2        male
3      female
4      female
        ...  
437      male
438      male
439      male
440    female
441    female
Name: sex, Length: 442, dtype: object
import numpy as np
# Add a synthetic 'label' column: one random integer per row, drawn from
# [1, 4) (i.e. 1, 2 or 3). Unseeded, so the values differ between runs.
df['label'] = np.random.randint(1, 4, len(df))

그룹화

df.groupby('sex').get_group('male').head()
age sex bmi bp s1 s2 s3 s4 s5 s6 label
0 0.038076 male 0.061696 0.021872 -0.044223 -0.034821 -0.043401 -0.002592 0.019908 -0.017646 3
2 0.085299 male 0.044451 -0.005671 -0.045599 -0.034194 -0.032356 -0.002592 0.002864 -0.025930 1
6 -0.045472 male -0.047163 -0.015999 -0.040096 -0.024800 0.000779 -0.039493 -0.062913 -0.038357 1
7 0.063504 male -0.001895 0.066630 0.090620 0.108914 0.022869 0.017703 -0.035817 0.003064 1
8 0.041708 male 0.061696 -0.040099 -0.013953 0.006202 -0.028674 -0.002592 -0.014956 0.011349 3
df.groupby('sex').mean()
age bmi bp s1 s2 s3 s4 s5 s6 label
sex
female -0.007756 -0.003936 -0.010759 -0.001575 -0.006368 0.016923 -0.014826 -0.006693 -0.009291 2.004255
male 0.008805 0.004468 0.012215 0.001788 0.007229 -0.019212 0.016832 0.007598 0.010548 2.048309
df.groupby('sex').size()
sex
female    235
male      207
dtype: int64
df.groupby(['sex', 'label']).mean()
age bmi bp s1 s2 s3 s4 s5 s6
sex label
female 1 -0.007983 -0.011001 -0.012674 -0.002239 -0.009755 0.023223 -0.020272 -0.007147 -0.013663
2 -0.001230 -0.001010 -0.006186 -0.000511 -0.005365 0.011965 -0.009201 -0.000791 -0.001290
3 -0.013975 0.000152 -0.013384 -0.001970 -0.004013 0.015599 -0.015003 -0.012071 -0.012875
male 1 0.007752 -0.003550 0.007619 0.003018 0.005739 -0.013468 0.013457 0.006458 0.012009
2 0.002551 0.004152 0.009424 -0.002898 0.006504 -0.020188 0.015658 0.000316 0.011208
3 0.014395 0.011708 0.018313 0.004214 0.009072 -0.023501 0.020655 0.014032 0.008779
df.groupby('label').mean().sort_values('bmi', ascending=False)
age bmi bp s1 s2 s3 s4 s5 s6
label
3 0.000210 0.005930 0.002464 0.001122 0.002529 -0.003951 0.002826 0.000981 -0.002048
2 0.000398 0.001213 0.000537 -0.001539 -0.000253 -0.001882 0.001505 -0.000314 0.004092
1 -0.000597 -0.007504 -0.003149 0.000228 -0.002482 0.006001 -0.004440 -0.000761 -0.001613

데이터 시각화

import matplotlib.pyplot as plt
# Line plot of the first rows of 'age'; `label` is the text shown in the legend.
plt.plot(df['age'].head(), label='age')
plt.xlabel('x')  # x-axis label
plt.ylabel('y')  # y-axis label
plt.legend()     # draw the legend
<matplotlib.legend.Legend at 0x7faf8828fbd0>

# Four series on the same axes, each styled a different way.
plt.plot(df['age'].head(), marker='o')                    # line with circle markers
plt.plot(df['bmi'].head(), marker='v', linestyle='none')  # markers only, no connecting line
plt.plot(df['bp'].head(), linestyle=':', color='g')       # green dotted line
plt.plot(df['s1'].head(), 'ro--') # format string: color, marker, linestyle
[<matplotlib.lines.Line2D at 0x7faf87eacf50>]

# figsize: figure size, alpha: transparency
plt.figure(figsize=(10,5))
plt.plot(df['age'].head(), 'bv:', alpha=0.3, label='age')
plt.plot(df['bmi'].head(), 's-.', label='bmi')
# Lay the legend entries out in two columns.
plt.legend(ncol=2)
<matplotlib.legend.Legend at 0x7faf87d222d0>

막대 그래프

plt.bar(df.index[:5], df['age'].head(), color=['r', 'g', 'b', 'r', 'g'])
<BarContainer object of 5 artists>

plt.bar(df.index[:5], df['age'].head(), width=0.3)
plt.xticks(rotation=45)
plt.yticks(rotation=45)
(array([-0.1 , -0.05,  0.  ,  0.05,  0.1 ]),
 <a list of 5 Text major ticklabel objects>)

plt.barh(df.index[:5], df['age'].head())
<BarContainer object of 5 artists>

# Keep the returned container so individual bars can be styled afterwards.
bar = plt.bar(df.index[:5], df['age'].head())
# Apply a hatch pattern to the first and third bars.
bar[0].set_hatch('/')
bar[2].set_hatch('x')

# Annotate each bar with its 'age' value, placed at x = bar position and
# y = bar height.
for idx, rect in enumerate(bar):
  plt.text(idx, rect.get_height(), df['age'][idx])

데이터 프레임 적용

d = df.sort_values('age')
plt.plot(d['age'], d['bmi'])

plt.grid()

plt.bar(df.index[:5], df['age'].head(), color='r')
plt.bar(df.index[:5], df['bmi'].head(), color='b')
<BarContainer object of 5 artists>

# Stacked bars: `bottom` sets where each series starts, so 'bmi' begins at the
# 'age' value and 's1' begins at the sum of 'age' and 'bmi'.
plt.bar(df.index[:5], df['age'].head(), color='r')
plt.bar(df.index[:5], df['bmi'].head(), color='b', bottom=df['age'].head())
plt.bar(df.index[:5], df['s1'].head(), color='g', bottom=df['age'].head() + df['bmi'].head())
<BarContainer object of 5 artists>

# Bar width; also used as the horizontal offset between the series.
w = 0.25

# Grouped bars: shift the x positions by -w / 0 / +w so the three series sit
# side by side at each index instead of overlapping.
plt.bar(df.index[:5]-w, df['age'].head(), color='r', width=w)
plt.bar(df.index[:5], df['bmi'].head(), color='b', width=w)
plt.bar(df.index[:5]+w, df['s1'].head(), color='g', width=w)
<BarContainer object of 5 artists>

# Group the rows by the 'sex' column.
g = df.groupby('sex')

# Pie chart of the row count per group; the group keys are the wedge labels
# and autopct prints each wedge's percentage with one decimal place.
plt.pie(g.size(), labels=g.size().index, autopct='%.1f')
plt.show()




카테고리:

업데이트:

댓글남기기