import pandas as pd
from sklearn.model_selection import train_test_split
# Read the three competition CSV files: training data, test data and the
# sample submission.
train, test, sub = map(
    pd.read_csv,
    (
        "../dataset/Space_Titanic/train.csv",
        "../dataset/Space_Titanic/test.csv",
        "../dataset/Space_Titanic/sample_submission.csv",
    ),
)
train.shape, test.shape, sub.shape
((8693, 14), (4277, 13), (4277, 2))
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8693 entries, 0 to 8692 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PassengerId 8693 non-null object 1 HomePlanet 8492 non-null object 2 CryoSleep 8476 non-null object 3 Cabin 8494 non-null object 4 Destination 8511 non-null object 5 Age 8514 non-null float64 6 VIP 8490 non-null object 7 RoomService 8512 non-null float64 8 FoodCourt 8510 non-null float64 9 ShoppingMall 8485 non-null float64 10 Spa 8510 non-null float64 11 VRDeck 8505 non-null float64 12 Name 8493 non-null object 13 Transported 8693 non-null bool dtypes: bool(1), float64(6), object(7) memory usage: 891.5+ KB
train.head()
PassengerId | HomePlanet | CryoSleep | Cabin | Destination | Age | VIP | RoomService | FoodCourt | ShoppingMall | Spa | VRDeck | Name | Transported | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0001_01 | Europa | False | B/0/P | TRAPPIST-1e | 39.0 | False | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | Maham Ofracculy | False |
1 | 0002_01 | Earth | False | F/0/S | TRAPPIST-1e | 24.0 | False | 109.0 | 9.0 | 25.0 | 549.0 | 44.0 | Juanna Vines | True |
2 | 0003_01 | Europa | False | A/0/S | TRAPPIST-1e | 58.0 | True | 43.0 | 3576.0 | 0.0 | 6715.0 | 49.0 | Altark Susent | False |
3 | 0003_02 | Europa | False | A/0/S | TRAPPIST-1e | 33.0 | False | 0.0 | 1283.0 | 371.0 | 3329.0 | 193.0 | Solam Susent | False |
4 | 0004_01 | Earth | False | F/1/S | TRAPPIST-1e | 16.0 | False | 303.0 | 70.0 | 151.0 | 565.0 | 2.0 | Willy Santantines | True |
train.columns
Index(['PassengerId', 'HomePlanet', 'CryoSleep', 'Cabin', 'Destination', 'Age', 'VIP', 'RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'Name', 'Transported'], dtype='object')
# Features are the five spending columns; the target is the boolean
# "Transported" column.
sel = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
y = train['Transported']
X = train[sel]
# A fixed random_state keeps the train/hold-out split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.pipeline import Pipeline

# Baseline pipeline: mean-impute missing values, min-max scale the features,
# then fit a decision tree classifier.
imputer = SimpleImputer(strategy="mean")
scaler = MinMaxScaler()
model = DTC()
pipe_line = Pipeline(
    steps=[
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model),
    ]
)
pipe_line.fit(X_train, y_train)
Pipeline(steps=[('imputer', SimpleImputer()), ('scaler', MinMaxScaler()), ('model', DecisionTreeClassifier())])
pipe_line.predict(X_test)
array([False, True, False, ..., True, True, True])
pipe_line.score(X_test, y_test)
0.7378104875804968
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier as KNN_C
# Second candidate: same mean imputation, but standardised features feeding a
# k-nearest-neighbours classifier.
imputer = SimpleImputer(strategy="mean")
scaler = StandardScaler()
model = KNN_C()
pipe_line = Pipeline(
    steps=[
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model),
    ]
)
pipe_line.fit(X_train, y_train)
# Accuracy on the hold-out split.
pipe_line.score(X_test, y_test)
0.764949402023919
def pipe_line_fnc(X, imputer, scaler, model):
    """Predict for X by applying imputer, scaler and model one after another.

    Nothing is fitted here: only ``transform`` and ``predict`` are called, so
    all three objects must already be fitted by the caller.

    Args:
        X: Feature data accepted by ``imputer.transform``.
        imputer: Object exposing ``transform``; applied first.
        scaler: Object exposing ``transform``; receives the imputer's output.
        model: Object exposing ``predict``; receives the scaler's output.

    Returns:
        Whatever ``model.predict`` returns for the transformed data.
    """
    imputed = imputer.transform(X)
    scaled = scaler.transform(imputed)
    return model.predict(scaled)
# Predict on the hold-out features by passing the imputer, scaler and model
# objects directly to the manual helper.
pred_Y = pipe_line_fnc(X_test, imputer, scaler, model)
# Preview the first five predictions.
pred_Y[:5]
array([False, True, False, False, True])
pred_Y.shape
(2174,)
from sklearn.metrics import accuracy_score
# scikit-learn metrics take (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order keeps this call consistent
# with metrics where the order does matter.
accuracy_score(y_test, pred_Y)
0.764949402023919
from sklearn.metrics import classification_report
# classification_report takes (y_true, y_pred). Passing them the other way
# round swaps the precision and recall columns and reports support for the
# predictions instead of the true labels, so any report recorded from the
# reversed call should be regenerated.
print(classification_report(y_test, pred_Y))
precision recall f1-score support False 0.70 0.80 0.75 934 True 0.83 0.74 0.78 1240 accuracy 0.76 2174 macro avg 0.76 0.77 0.76 2174 weighted avg 0.77 0.76 0.77 2174
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
import sklearn.externals
import joblib
# Recreate the same features, target and hold-out split used earlier.
sel = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
y = train['Transported']
X = train[sel]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Logistic-regression pipeline: mean imputation -> min-max scaling -> model.
imputer = SimpleImputer(strategy="mean")
scaler = MinMaxScaler()
model = LogisticRegression()
pipe_line_Log = Pipeline(
    steps=[
        ("imputer", imputer),
        ("scaler", scaler),
        ("model", model),
    ]
)
pipe_line_Log.fit(X_train, y_train)
pred = pipe_line_Log.predict(X_test)
# Peek at the first five predictions.
pred[0:5]
array([ True, True, False, False, True])
# Check accuracy (scikit-learn metrics take y_true first, then y_pred).
print(accuracy_score(y_test, pred))
0.7382704691812327
# Build the KNN pipeline with its own imputer and scaler. Pipeline fits the
# step objects it is given in place, so reusing the instances that already
# belong to another pipeline would refit that pipeline's transformers too.
model = KNeighborsClassifier()
pipe_line_knn = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", MinMaxScaler()),
        ("model", model),
    ]
)
pipe_line_knn.fit(X_train, y_train)
pred = pipe_line_knn.predict(X_test)
# Check accuracy (scikit-learn metrics take y_true first, then y_pred).
print(accuracy_score(y_test, pred))
0.7723091076356946
# Persist the fitted KNN pipeline (imputer, scaler and model together) to disk.
joblib.dump(pipe_line_knn, "../dataset/Space_Titanic/model_pipe_knn.joblib" )
['../dataset/Space_Titanic/model_pipe_knn.joblib']
import os
os.listdir("../dataset/Space_Titanic/")
['first_sub.csv', 'model_pipe_knn.joblib', 'sample_submission.csv', 'test.csv', 'train.csv']
import sklearn.externals
import joblib
# Reload the persisted pipeline and check its accuracy on the hold-out split.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load files you created yourself or otherwise trust.
pipe_knn = joblib.load("../dataset/Space_Titanic/model_pipe_knn.joblib")
pred = pipe_knn.predict(X_test)
# Check accuracy (scikit-learn metrics take y_true first, then y_pred).
print(accuracy_score(y_test, pred))
0.7723091076356946