import pandas as pd
from sklearn.model_selection import train_test_split

# Load the training set, the test set and the sample submission file.
train = pd.read_csv("../dataset/Space_Titanic/train.csv")
test = pd.read_csv("../dataset/Space_Titanic/test.csv")
sub = pd.read_csv("../dataset/Space_Titanic/sample_submission.csv")

# Notebook display: (rows, columns) of each frame.
train.shape, test.shape, sub.shape

((8693, 14), (4277, 13), (4277, 2))


# Summarize dtypes and non-null counts to see which columns contain missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   PassengerId   8693 non-null   object 
 1   HomePlanet    8492 non-null   object 
 2   CryoSleep     8476 non-null   object 
 3   Cabin         8494 non-null   object 
 4   Destination   8511 non-null   object 
 5   Age           8514 non-null   float64
 6   VIP           8490 non-null   object 
 7   RoomService   8512 non-null   float64
 8   FoodCourt     8510 non-null   float64
 9   ShoppingMall  8485 non-null   float64
 10  Spa           8510 non-null   float64
 11  VRDeck        8505 non-null   float64
 12  Name          8493 non-null   object 
 13  Transported   8693 non-null   bool   
dtypes: bool(1), float64(6), object(7)
memory usage: 891.5+ KB


# Preview the first rows of the training data.
train.head()


# List the available column names.
train.columns

Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age',
       'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck',
       'Name', 'Transported'],
      dtype='object')


# Use five of the float64 columns as features; the target is the boolean `Transported` column.
sel = [ 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
X = train[sel]
y = train['Transported']

# Hold out an evaluation split (train_test_split's default size); random_state
# fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as DTC


# Pipeline steps: fill missing values with the column mean, rescale each
# feature with MinMaxScaler, then classify with a decision tree.
imputer = SimpleImputer(strategy = "mean")
scaler = MinMaxScaler()
model = DTC()


from sklearn.pipeline import Pipeline

# Chain the steps: fit() fits each transformer on X_train in order and then
# trains the final model on the transformed data.
pipe_line = Pipeline([ ("imputer", imputer), ("scaler", scaler), ("model", model) ])
pipe_line.fit(X_train, y_train)

Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()),
                ('model', DecisionTreeClassifier())])


# Predict on the held-out rows; the pipeline applies the fitted imputer and scaler before the model.
pipe_line.predict(X_test)

array([False,  True, False, ...,  True,  True,  True])


# Mean accuracy of the pipeline on the held-out split.
pipe_line.score(X_test, y_test)

0.7378104875804968


from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KNN_C

# Second variant: same mean imputation, but standardize the features and
# classify with k-nearest neighbours. These assignments replace the earlier
# imputer/scaler/model objects.
imputer = SimpleImputer(strategy = "mean")
scaler = StandardScaler()
model = KNN_C()


# Rebuild the pipeline from the new steps.
pipe_line = Pipeline([ ("imputer", imputer), 
                      ("scaler", scaler), 
                      ("model", model) ])

# Fit on the training split, then report mean accuracy on the held-out split.
pipe_line.fit(X_train, y_train)
pipe_line.score(X_test, y_test)

0.764949402023919


def pipe_line_fnc(X, imputer, scaler, model):
    """Apply already-fitted pipeline steps by hand and return the predictions.

    Args:
        X: Feature data to predict on.
        imputer: Fitted object exposing ``transform``; applied first.
        scaler: Fitted object exposing ``transform``; applied to the
            imputer's output.
        model: Fitted object exposing ``predict``; applied to the scaler's
            output.

    Returns:
        Whatever ``model.predict`` returns for the transformed data.
    """
    imputed = imputer.transform(X)
    scaled = scaler.transform(imputed)
    return model.predict(scaled)

# Run the fitted steps by hand; imputer, scaler and model are the same objects
# that pipe_line fitted above.
pred_Y = pipe_line_fnc(X_test, imputer, scaler, model)

# Peek at the first five predictions.
pred_Y[:5]

array([False,  True, False, False,  True])


# One prediction per held-out row.
pred_Y.shape

(2174,)


from sklearn.metrics import accuracy_score
# The documented signature is accuracy_score(y_true, y_pred): ground truth
# first. Accuracy happens to be symmetric, but most other metrics are not.
accuracy_score(y_test, pred_Y)

0.764949402023919


from sklearn.metrics import classification_report
# classification_report expects (y_true, y_pred). With the arguments reversed,
# the precision and recall columns are swapped and `support` counts the
# predictions per class instead of the true labels.
print(classification_report(y_test, pred_Y))

              precision    recall  f1-score   support

       False       0.70      0.80      0.75       934
        True       0.83      0.74      0.78      1240

    accuracy                           0.76      2174
   macro avg       0.76      0.77      0.76      2174
weighted avg       0.77      0.76      0.77      2174


from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
import sklearn.externals
import joblib


# Rebuild the same feature matrix, target and reproducible split as before so
# this section can run on its own.
sel = [ 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck' ]
X = train[sel]
y = train['Transported']

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)


# Steps for the next pipeline: mean imputation, min-max scaling, logistic regression.
imputer = SimpleImputer(strategy = "mean")
scaler = MinMaxScaler()
model = LogisticRegression()


# Fit the logistic-regression pipeline on the training split and predict the held-out rows.
pipe_line_Log = Pipeline([ ("imputer", imputer), ("scaler", scaler), ("model", model) ])
pipe_line_Log.fit(X_train, y_train)
pred = pipe_line_Log.predict(X_test)
# Peek at the first five predictions.
pred[0:5]

array([ True,  True, False, False,  True])


# Check accuracy; ground truth goes first, as in accuracy_score(y_true, y_pred).
print(accuracy_score(y_test, pred))

0.7382704691812327


# Swap the classifier for k-nearest neighbours.
model = KNeighborsClassifier()


# Give this pipeline its own imputer and scaler. Pipeline keeps the step
# objects it is handed without copying them, so passing `imputer` and `scaler`
# here would make pipe_line_knn.fit() refit the very objects that
# pipe_line_Log already holds.
pipe_line_knn = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", MinMaxScaler()),
    ("model", model),
])
pipe_line_knn.fit(X_train, y_train)
pred = pipe_line_knn.predict(X_test)

# Check accuracy; ground truth goes first, as in accuracy_score(y_true, y_pred).
print(accuracy_score(y_test, pred))

0.7723091076356946


# Persist the whole fitted pipeline (preprocessing steps and model) to disk.
joblib.dump(pipe_line_knn, "../dataset/Space_Titanic/model_pipe_knn.joblib" )

['../dataset/Space_Titanic/model_pipe_knn.joblib']


import os
# List the dataset folder to confirm the saved pipeline file is there.
os.listdir("../dataset/Space_Titanic/")

['first_sub.csv',
 'model_pipe_knn.joblib',
 'sample_submission.csv',
 'test.csv',
 'train.csv']


import sklearn.externals
import joblib

# Restore the fitted pipeline from disk.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files that come from a trusted source.
pipe_knn = joblib.load("../dataset/Space_Titanic/model_pipe_knn.joblib")

pred = pipe_knn.predict(X_test)

# Check accuracy (ground truth first); it should equal the score printed
# before the pipeline was saved.
print(accuracy_score(y_test, pred))

0.7723091076356946

	PassengerId	HomePlanet	CryoSleep	Cabin	Destination	Age	VIP	RoomService	FoodCourt	ShoppingMall	Spa	VRDeck	Name	Transported
0	0001_01	Europa	False	B/0/P	TRAPPIST-1e	39.0	False	0.0	0.0	0.0	0.0	0.0	Maham Ofracculy	False
1	0002_01	Earth	False	F/0/S	TRAPPIST-1e	24.0	False	109.0	9.0	25.0	549.0	44.0	Juanna Vines	True
2	0003_01	Europa	False	A/0/S	TRAPPIST-1e	58.0	True	43.0	3576.0	0.0	6715.0	49.0	Altark Susent	False
3	0003_02	Europa	False	A/0/S	TRAPPIST-1e	33.0	False	0.0	1283.0	371.0	3329.0	193.0	Solam Susent	False
4	0004_01	Earth	False	F/1/S	TRAPPIST-1e	16.0	False	303.0	70.0	151.0	565.0	2.0	Willy Santantines	True

머신러닝 파이프라인

학습 목표

목차

01 데이터 불러오기

파이프라인에 사용될 임퓨터, 스케일러, 분류 모델 정의

02 파이프라인 생성 및 학습

파이프라인을 이용한 예측 수행

정리

실습

모델의 스케일링 함수 변경해 보기

Pipeline 클래스의 단점

03 파이프라인 함수로 만들기

pipe_line_fnc 함수 만들기

최종 결과 73.59%

04 학습내용 저장 및 불러오기

불러오기 후, 확인