import pandas as pd
import seaborn as sns
import numpy as np
# Load seaborn's "iris" dataset, report the pandas version in use,
# and leave the frame as the cell's last expression so the notebook shows it.
iris = sns.load_dataset("iris")
print(pd.__version__)
iris
2.0.3
| | sepal_length | sepal_width | petal_length | petal_width | species |
---|---|---|---|---|---|
0 | 5.1 | 3.5 | 1.4 | 0.2 | setosa |
1 | 4.9 | 3.0 | 1.4 | 0.2 | setosa |
2 | 4.7 | 3.2 | 1.3 | 0.2 | setosa |
3 | 4.6 | 3.1 | 1.5 | 0.2 | setosa |
4 | 5.0 | 3.6 | 1.4 | 0.2 | setosa |
... | ... | ... | ... | ... | ... |
145 | 6.7 | 3.0 | 5.2 | 2.3 | virginica |
146 | 6.3 | 2.5 | 5.0 | 1.9 | virginica |
147 | 6.5 | 3.0 | 5.2 | 2.0 | virginica |
148 | 6.2 | 3.4 | 5.4 | 2.3 | virginica |
149 | 5.9 | 3.0 | 5.1 | 1.8 | virginica |
150 rows × 5 columns
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
# Separate the data into model inputs (features) and the output (label).
y = iris["species"]    # label: the 5th column, species
X = iris.iloc[:, :4]   # features: columns 1 through 4
(X.shape, y.shape)
((150, 4), (150,))
y  # display the label Series (the species column selected above)
0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object
# Split the data into a training set and a validation set:
# 30% for testing, 70% for training.
# A fixed random_state makes the shuffle repeatable, so the rows that land in
# each set (and therefore every score computed from them) are the same on
# every run instead of changing each time the cell is executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)
# Model choice: an ensemble (random forest) of 10 trees, each at most 5 deep.
# random_state seeds the forest's internal randomness so that fitting on the
# same training data yields the same model on every run.
model = RandomForestClassifier(max_depth=5, n_estimators=10, random_state=42)
model.fit(X_train, y_train)
RandomForestClassifier(max_depth=5, n_estimators=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(max_depth=5, n_estimators=10)
# Run predictions with the trained model, then print how many predictions
# were made followed by the first ten predicted labels.
y_pred = model.predict(X_test)
print(len(y_pred))
print(y_pred[:10])
45
['versicolor' 'versicolor' 'virginica' 'versicolor' 'setosa' 'versicolor' 'setosa' 'versicolor' 'versicolor' 'setosa']
# Put predictions and true labels side by side, one row per test sample.
# The true labels are taken as a plain array so rows pair up by position
# (not by the original dataset index), giving a fresh 0..n-1 index.
df_iris = pd.DataFrame({
    'pred_val': y_pred,
    'actual': y_test.to_numpy(),
})
df_iris.head()
| | pred_val | actual |
---|---|---|
0 | versicolor | versicolor |
1 | versicolor | versicolor |
2 | virginica | virginica |
3 | versicolor | versicolor |
4 | setosa | setosa |
# Flag every row: 1.0 when the prediction equals the true label, 0.0 otherwise.
# Writing 1 only into the matching rows would leave the misses as NaN, and
# pandas skips NaN by default in value_counts() and mean(), so wrong
# predictions would vanish from any tally or accuracy computed on this column.
df_iris['correct'] = (df_iris['pred_val'] == df_iris['actual']).astype(float)
df_iris.head(10)
| | pred_val | actual | correct |
---|---|---|---|
0 | versicolor | versicolor | 1.0 |
1 | versicolor | versicolor | 1.0 |
2 | virginica | virginica | 1.0 |
3 | versicolor | versicolor | 1.0 |
4 | setosa | setosa | 1.0 |
5 | versicolor | versicolor | 1.0 |
6 | setosa | setosa | 1.0 |
7 | versicolor | versicolor | 1.0 |
8 | versicolor | versicolor | 1.0 |
9 | setosa | setosa | 1.0 |
# Tally the 'correct' flags over ALL rows. value_counts() drops NaN by
# default, which would hide any row whose flag was never set;
# dropna=False keeps such rows visible in the count.
df_iris['correct'].value_counts(dropna=False)
correct
1.0    40
Name: count, dtype: int64
# Accuracy = correctly predicted rows / all rows.
# Rows with no flag in 'correct' (NaN) are wrong predictions and must count
# as 0; otherwise the mean skips them, divides by the number of matches only,
# and reports 1.0 no matter how many predictions were wrong.
np.mean(df_iris['correct'].fillna(0))
1.0