from IPython.display import display, Image
# Render the image file img/onehotencoding.png in the cell output.
display(Image(filename='img/onehotencoding.png'))


### 01. Prepare the data
import pandas as pd

# One categorical column holding four distinct letters, deliberately not in
# alphabetical order.
data = {"eng": list("bcad")}
df = pd.DataFrame(data=data)
print(type(df))
df

<class 'pandas.core.frame.DataFrame'>


from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# Learn an integer code for every distinct value in 'eng', then store the
# codes next to the original column.
en_x = LabelEncoder()
en_x.fit(df['eng'])
df['라벨인코딩'] = en_x.transform(df['eng'])
df


# The integer codes as a NumPy array.
df['라벨인코딩'].values

array([1, 2, 0, 3])


# One-hot encode the integer labels produced by the LabelEncoder step.
onehot = OneHotEncoder()
# reshape(-1, 1) turns the 1-D label vector into a single-column 2-D array
# before it is handed to the encoder.
val = df['라벨인코딩'].values.reshape(-1, 1)
onehot.fit(val)
# Transform the values, then convert the encoder output to a dense array.
y = onehot.transform(val).toarray()
y

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [1., 0., 0., 0.],
       [0., 0., 0., 1.]])


# Wrap the one-hot matrix in a DataFrame, casting the 0./1. floats to ints.
onehot_val = pd.DataFrame(data=y, dtype=int)
onehot_val


# Put the one-hot columns to the right of the original frame.
df_new = pd.concat(objs=[df, onehot_val], axis="columns")
df_new


# Second example: company names, with 'Google' appearing twice.
data = {"회사명": ["MS", "Apple", "Google", "Google"]}
df1 = pd.DataFrame(data=data)
# Deep copy, so df2 is a separate object from df1.
df2 = df1.copy(deep=True)
df2


### OneHotEncoding
from sklearn.preprocessing import LabelEncoder, OneHotEncoder


# Look at the raw company-name column before encoding it.
df1.loc[:, '회사명']

0        MS
1     Apple
2    Google
3    Google
Name: 회사명, dtype: object


### LabelEncoder
# Learn an integer code for every distinct company name and store the codes
# in a new 'lbl_en' column.
encoder_x = LabelEncoder()
encoder_x.fit(df1['회사명'])
df1['lbl_en'] = encoder_x.transform(df1['회사명'])
df1


# The integer codes as a NumPy array.
df1['lbl_en'].values

array([2, 0, 1, 1])


# One-hot encode the 'lbl_en' codes; reshape(-1, 1) supplies them as a
# single-column 2-D array.
onehot = OneHotEncoder()
onehot.fit(df1['lbl_en'].values.reshape(-1, 1))
y = onehot.transform(df1['lbl_en'].values.reshape(-1, 1)).toarray()
print(y)

[[0. 0. 1.]
 [1. 0. 0.]
 [0. 1. 0.]
 [0. 1. 0.]]


# Convert the encoded values into an integer DataFrame.
dx = pd.DataFrame(data=y, dtype=int)
dx


# Put the one-hot columns to the right of df1.
df1_new = pd.concat(objs=[df1, dx], axis="columns")
df1_new


from tensorflow.keras.utils import to_categorical
import numpy as np

# Example integer class ids to encode.
data = [15, 17, 5, 10, 0]
dat = np.asarray(data)
print(dat)

# One-hot encode the ids with Keras.
encoded = to_categorical(dat)
print(encoded)

[15 17  5 10  0]
[[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]]


# Invert the encoding: the index of the largest entry in row 1 is the
# original class id.
inverted = encoded[1].argmax()
print(inverted)

17


import pandas as pd
import os


# Toy frame with one categorical column; '양말' occurs twice.
demo_df = pd.DataFrame(data={"범주형_feature": ['양말', '여우', '양말', '상자']})
display(demo_df)


# pandas-only route: get_dummies builds the indicator columns straight from
# the frame.
onehot = pd.get_dummies(data=demo_df)
onehot


# Show the original column and its indicator columns side by side.
df = pd.concat(objs=[demo_df, onehot], axis="columns")
df

ch04 데이터 표현과 특성 공학

학습 내용

01 용어 이해해보기

1-1. 연속형, 범주형 feature

1-2. 특성 공학(feature engineering)

1-3. Label Encoding 알아보기

1-4. What is One Hot Encoding? (One Hot Encoding은 무엇인가?)

1-5. Why do you need one hot encoding?

(왜 필요할까?)

Label 인코딩의 오류

Label 인코딩의 문제는 범주에 부여된 숫자가 클수록 해당 카테고리가 더 우수하다고 모델이 가정하게 된다는 점입니다.

02 왜 사용하나?

03 레이블 인코딩, 원핫 인코딩 실습해 보기(1)

LabelEncoder 사용하기

데이터 전처리

원핫 인코딩(OneHotEncoding) 실습

04 레이블 인코딩, 원핫 인코딩 실습해 보기(2)

OneHotEncoding

05 Keras를 활용한 원핫 인코딩

실습

06 Pandas를 활용한 원핫 인코딩

과제

History

	eng
0	b
1	c
2	a
3	d

	eng	라벨인코딩
0	b	1
1	c	2
2	a	0
3	d	3

	eng	라벨인코딩	0	1	2	3
0	b	1	0	1	0	0
1	c	2	0	0	1	0
2	a	0	1	0	0	0
3	d	3	0	0	0	1

	범주형_feature
0	양말
1	여우
2	양말
3	상자