import warnings
warnings.filterwarnings(action='ignore')
# warnings.filterwarnings(action='default')
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import numpy as np

# Load the breast-cancer dataset: 569 samples x 30 original features.
cancer = load_breast_cancer()
print(cancer.data.shape)  # (569, 30)

# Fixed seed so the generated noise features are reproducible.
rng = np.random.RandomState(42)
noise = rng.normal(size=(len(cancer.data), 90))
print(noise.shape)  # (569, 90)

# Append 90 pure-noise columns to the data:
# the first 30 columns are the real features, the next 90 are noise
# that carries no information about the target.
X_w_noise = np.hstack([cancer.data, noise])
print(X_w_noise.shape)  # (569, 120)

X = X_w_noise      # inputs
y = cancer.target  # labels
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0,
                                                    test_size=0.3)
from sklearn.ensemble import RandomForestClassifier

# Baseline: train a random forest on all 120 features (30 real + 90 noise).
# FIX: the original left random_state unset, so the reported scores varied
# from run to run; pinned to 42 for consistency with the later models.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
print("전체 특성 사용(학습) : {:.3f}".format(model.score(X_train, y_train)))
print("전체 특성 사용(테스트) : {:.3f}".format(model.score(X_test, y_test)))
# Observed output: train 1.000, test ~0.95
# Univariate selection: keep the 30% of features with the highest
# ANOVA F-statistic (36 of the 120 columns).
select = SelectPercentile(score_func=f_classif, percentile=30)
select.fit(X_train, y_train)

# Apply the learned selection to the training set.
X_train_selected = select.transform(X_train)
print("X_train.shape:", X_train.shape)                   # (398, 120)
print("X_train_selected.shape", X_train_selected.shape)  # (398, 36)

import matplotlib.pyplot as plt

# Visualize which features were kept (black = selected); most of the
# 30 real features survive, most noise columns are dropped.
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')

# Reduce the test set with the SAME mask learned on the training data
# (equivalent to select.transform(X_test); no leakage).
X_test_selected = X_test[:, mask]
print(X_test_selected.shape)  # (171, 36)

model.fit(X_train_selected, y_train)
print("일부 특성 사용(학습) : {:.3f}".format(model.score(X_train_selected, y_train)))
print("일부 특성 사용(테스트): {:.3f}".format(model.score(X_test_selected, y_test)))
# Observed output: train 1.000, test ~0.965
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Model-based selection: keep the features whose random-forest importance
# exceeds 1.5x the median importance (also yields 36 features here).
select = SelectFromModel(RandomForestClassifier(n_estimators=100,
                                                random_state=42),
                         threshold="1.5 * median")  # median of feature importances
select.fit(X_train, y_train)
X_train_l1 = select.transform(X_train)
print("X_train.shape :", X_train.shape)        # (398, 120)
print("X_train_l1.shape :", X_train_l1.shape)  # (398, 36)

# Visualize which features were selected (black = selected).
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("특성 번호")

# BUG FIX: the original called select.fit(X_test, y_test) before
# transforming the test set — fitting the selector on test data is
# leakage. The test set must be reduced with the mask learned from the
# training data only (the original refit on train right after, so the
# final scores are unchanged).
X_test_l1 = X_test[:, mask]

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_l1, y_train)
print("일부 특성 사용(학습) : {:.3f}".format(model.score(X_train_l1, y_train)))
print("일부 특성 사용(테스트) : {:.3f}".format(model.score(X_test_l1, y_test)))
# score = LogisticRegression().fit(X_train, y_train).score(X_test_l1, y_test)
# Observed output: train 1.000, test ~0.959
# Recursive feature elimination: repeatedly drop the least important
# feature (per random-forest importances) until 36 remain.
# FIX: the original used the %%time IPython cell magic, which is invalid
# in a plain Python script; replaced with an explicit timer.
import time
from sklearn.feature_selection import RFE

start = time.perf_counter()
select = RFE(RandomForestClassifier(n_estimators=100, random_state=42),
             n_features_to_select=36)
select.fit(X_train, y_train)
print("RFE fit time: {:.1f} s".format(time.perf_counter() - start))  # ~17.5 s originally

# Show the selected features (black = selected).
mask = select.get_support()
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("특성 번호")

X_train_rfe = select.transform(X_train)
model = RandomForestClassifier(n_estimators=100,
                               random_state=42).fit(X_train_rfe, y_train)
score = model.score(X_train_rfe, y_train)
print("학습용 평가 점수 : {:.3f}".format(score))

# Reduce the test set with the mask learned on the training data.
X_test_rfe = X_test[:, mask]
score = model.score(X_test_rfe, y_test)
print("테스트용 평가 점수 : {:.3f}".format(score))
# Observed output: train 1.000, test ~0.965