import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
cancer = load_breast_cancer()
cancer_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
cancer_df.head()
| | mean radius | mean texture | mean perimeter | mean area | mean smoothness | mean compactness | mean concavity | mean concave points | mean symmetry | mean fractal dimension | ... | worst radius | worst texture | worst perimeter | worst area | worst smoothness | worst compactness | worst concavity | worst concave points | worst symmetry | worst fractal dimension |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 17.99 | 10.38 | 122.80 | 1001.0 | 0.11840 | 0.27760 | 0.3001 | 0.14710 | 0.2419 | 0.07871 | ... | 25.38 | 17.33 | 184.60 | 2019.0 | 0.1622 | 0.6656 | 0.7119 | 0.2654 | 0.4601 | 0.11890 |
| 1 | 20.57 | 17.77 | 132.90 | 1326.0 | 0.08474 | 0.07864 | 0.0869 | 0.07017 | 0.1812 | 0.05667 | ... | 24.99 | 23.41 | 158.80 | 1956.0 | 0.1238 | 0.1866 | 0.2416 | 0.1860 | 0.2750 | 0.08902 |
| 2 | 19.69 | 21.25 | 130.00 | 1203.0 | 0.10960 | 0.15990 | 0.1974 | 0.12790 | 0.2069 | 0.05999 | ... | 23.57 | 25.53 | 152.50 | 1709.0 | 0.1444 | 0.4245 | 0.4504 | 0.2430 | 0.3613 | 0.08758 |
| 3 | 11.42 | 20.38 | 77.58 | 386.1 | 0.14250 | 0.28390 | 0.2414 | 0.10520 | 0.2597 | 0.09744 | ... | 14.91 | 26.50 | 98.87 | 567.7 | 0.2098 | 0.8663 | 0.6869 | 0.2575 | 0.6638 | 0.17300 |
| 4 | 20.29 | 14.34 | 135.10 | 1297.0 | 0.10030 | 0.13280 | 0.1980 | 0.10430 | 0.1809 | 0.05883 | ... | 22.54 | 16.67 | 152.20 | 1575.0 | 0.1374 | 0.2050 | 0.4000 | 0.1625 | 0.2364 | 0.07678 |
5 rows × 30 columns
cancer_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype
---  ------                   --------------  -----
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         569 non-null    float64
 15  compactness error        569 non-null    float64
 16  concavity error          569 non-null    float64
 17  concave points error     569 non-null    float64
 18  symmetry error           569 non-null    float64
 19  fractal dimension error  569 non-null    float64
 20  worst radius             569 non-null    float64
 21  worst texture            569 non-null    float64
 22  worst perimeter          569 non-null    float64
 23  worst area               569 non-null    float64
 24  worst smoothness         569 non-null    float64
 25  worst compactness        569 non-null    float64
 26  worst concavity          569 non-null    float64
 27  worst concave points     569 non-null    float64
 28  worst symmetry           569 non-null    float64
 29  worst fractal dimension  569 non-null    float64
dtypes: float64(30)
memory usage: 133.5 KB
print(cancer_df.shape)
(569, 30)
# Assign the features and the label.
X = cancer_df[:]   # all 30 feature columns
y = cancer.target
X.shape, y.shape
((569, 30), (569,))
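Since accuracy is the only metric used below, it helps to know how balanced the labels are. A quick sanity check (a small addition, not one of the original cells, using the already-loaded `cancer` Bunch):

```python
import numpy as np

# Label meanings and class counts: in this dataset 0 = malignant, 1 = benign.
print(cancer.target_names)           # ['malignant' 'benign']
print(np.bincount(cancer.target))    # [212 357] -> about 63% benign
```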
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
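With a somewhat imbalanced target, `stratify=y` keeps the 212/357 class ratio identical in both subsets. A hedged alternative to the split above; the `_s`-suffixed names are illustrative, and the rest of the notebook keeps the plain split:

```python
# Optional: a stratified split preserves the class ratio in train and test.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    X, y, test_size=0.2, random_state=0, stratify=y)
```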
# Choose the model
model_log = LogisticRegression()
# Train
model_log.fit(X_train, y_train)
# Predict
pred = model_log.predict(X_test)
pred[:15]
C:\Users\withJesus\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py:762: ConvergenceWarning: lbfgs failed to converge (status=1):
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
array([0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1])
print("LogisticRegression 분류기 정확도 : {0:.4f}".format(accuracy_score(y_test, pred) ) )
LogisticRegression 분류기 정확도 : 0.9474
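The ConvergenceWarning above states the fix itself: raise `max_iter` or scale the data. A minimal sketch of the scaled variant using a standard scikit-learn pipeline; this is one possible remedy, not the notebook's original approach:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize the features before the logistic fit so lbfgs converges;
# max_iter is also raised as a belt-and-suspenders measure.
model_log_scaled = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))
model_log_scaled.fit(X_train, y_train)
print("Scaled LogisticRegression accuracy : {0:.4f}".format(
    model_log_scaled.score(X_test, y_test)))
```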
# Choose the model
model_knn = KNeighborsClassifier()
# Train
model_knn.fit(X_train, y_train)
# Predict
pred = model_knn.predict(X_test)
pred[:15]
array([0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0])
print("KNeighborsClassifier 분류기 정확도 : {0:.4f}".format(accuracy_score(y_test, pred) ) )
LogisticRegression 분류기 정확도 : 0.9386
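`n_neighbors` defaults to 5; a quick sweep shows how sensitive KNN is to that choice (an illustrative loop, not an original cell). KNN is distance-based, so large-magnitude features such as `mean area` dominate unless the data is scaled:

```python
# Try a few neighbor counts on the same split and compare test accuracy.
for k in [3, 5, 8, 11]:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_train)
    print("k={0}: accuracy {1:.4f}".format(k, knn.score(X_test, y_test)))
```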
from sklearn.ensemble import VotingClassifier
# Choose the individual models
model_log = LogisticRegression()
model_knn = KNeighborsClassifier(n_neighbors=8)
# Combine the individual models into a soft-voting ensemble
vo_clf = VotingClassifier(estimators=[("LR", model_log),
                                      ("KNN", model_knn)], voting='soft')  # default is voting='hard'
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
### VotingClassifier: train, predict, evaluate
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
print("Voting 분류기 정확도 : {0:.4f}".format(accuracy_score(y_test, pred)))
Voting 분류기 정확도 : 0.9561
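With `voting='soft'` the ensemble averages the members' `predict_proba` outputs and takes the argmax (hard voting would take a majority of predicted labels instead). A sketch that reproduces the prediction by hand from the fitted `vo_clf`:

```python
import numpy as np

# Average the two members' class probabilities and pick the larger one;
# with equal (default) weights this matches vo_clf.predict exactly.
proba_lr = vo_clf.named_estimators_["LR"].predict_proba(X_test)
proba_knn = vo_clf.named_estimators_["KNN"].predict_proba(X_test)
manual_pred = np.argmax((proba_lr + proba_knn) / 2, axis=1)
print((manual_pred == vo_clf.predict(X_test)).all())  # expected: True
```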
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings
warnings.filterwarnings('ignore')   # suppress warnings such as the ConvergenceWarning above
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.2, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape
((455, 30), (114, 30), (455,), (114,))
# Choose the model
model_rf = RandomForestClassifier()
# Train
model_rf.fit(X_train, y_train)
# Predict
pred = model_rf.predict(X_test)
print(pred[:15])
print("RandomForestClassifier accuracy : {0:.4f}".format(accuracy_score(y_test, pred)))
[0 1 1 1 1 1 1 1 1 1 0 1 1 0 0]
RandomForestClassifier accuracy : 0.9561
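matplotlib was imported at the top but never used; a natural follow-up is a bar chart of `feature_importances_` from the fitted forest (a sketch added here, not an original cell):

```python
# Plot the ten most important features of the fitted random forest.
importances = pd.Series(model_rf.feature_importances_, index=cancer.feature_names)
importances.sort_values().tail(10).plot(kind="barh")
plt.title("RandomForestClassifier: top 10 feature importances")
plt.tight_layout()
plt.show()
```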