Field | Description |
---|---|
datetime | hourly date + timestamp |
season | 1 = spring, 2 = summer, 3 = fall, 4 = winter |
holiday | whether the day is a holiday |
workingday | whether the day is neither a weekend nor a holiday |
weather | 1 = Clear, Few clouds, Partly cloudy; 2 = Mist + Cloudy, Mist + Broken clouds, Mist + Few clouds, Mist; 3 = Light Snow, Light Rain + Thunderstorm + Scattered clouds, Light Rain + Scattered clouds; 4 = Heavy Rain + Ice Pellets + Thunderstorm + Mist, Snow + Fog |
temp | temperature in Celsius |
atemp | "feels like" temperature in Celsius |
humidity | relative humidity |
windspeed | wind speed |
casual | number of rentals initiated by non-registered users |
registered | number of rentals initiated by registered users |
count | total number of rentals |
import pandas as pd
import matplotlib.pyplot as plt  ## more control than seaborn, but more verbose code
import seaborn as sns            ## seaborn is simpler to use than matplotlib
import numpy as np               # absolute value, math functions, etc.
train = pd.read_csv("../bike/train.csv", parse_dates=['datetime'])
test = pd.read_csv("../bike/test.csv", parse_dates=['datetime'])
col_names = ['season', 'holiday', 'workingday', 'weather']
i = 0
plt.figure(figsize=(12, 10))              # size of the whole figure
for name in col_names:                    # one iteration per column name -> 4 iterations
    i = i + 1                             # advance the subplot index
    plt.subplot(2, 2, i)                  # select the i-th cell in a 2x2 grid
    sns.countplot(x=name, data=train)     # draw a countplot in the i-th cell
plt.show()  # Jupyter renders inline, but in an editor/PyCharm this call is required
### temp, atemp, humidity, windspeed
num_names = ['temp', 'atemp', 'humidity', 'windspeed']
train.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype='object')
i = 0
plt.figure(figsize=(12, 25))  # size of the whole figure (width, height)
for name in num_names:        # one iteration per column name -> 4 iterations
    i = i + 1
    plt.subplot(4, 2, i * 2 - 1)      # 4x2 grid, left column: train
    sns.histplot(x=name, data=train)
    plt.title("train feature")
    plt.subplot(4, 2, i * 2)          # 4x2 grid, right column: test
    sns.histplot(x=name, data=test)
    plt.title("test feature")
plt.show()
new_tr = train.copy()
new_test = test.copy()
new_tr.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype='object')
## Creating dummy and derived variables
new_tr['year'] = new_tr['datetime'].dt.year
new_tr.head()
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | year |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 2011 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 2011 |
new_tr['month'] = new_tr['datetime'].dt.month
new_tr['day'] = new_tr['datetime'].dt.day
new_tr['hour'] = new_tr['datetime'].dt.hour
new_tr['minute'] = new_tr['datetime'].dt.minute
new_tr['second'] = new_tr['datetime'].dt.second
new_tr['dayofweek'] = new_tr['datetime'].dt.dayofweek
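Because the same six calendar parts are derived again for new_test further below, a small helper keeps the two derivations identical; this is a hedged sketch, and add_datetime_parts is a name introduced here purely for illustration:

def add_datetime_parts(df):
    # derive calendar parts from the parsed datetime column (same fields as above)
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df['second'] = df['datetime'].dt.second
    df['dayofweek'] = df['datetime'].dt.dayofweek  # Monday=0, Sunday=6
    return df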
new_tr.head()
 | datetime | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | year | month | day | hour | minute | second | dayofweek |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2011-01-01 00:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 3 | 13 | 16 | 2011 | 1 | 1 | 0 | 0 | 0 | 5 |
1 | 2011-01-01 01:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 8 | 32 | 40 | 2011 | 1 | 1 | 1 | 0 | 0 | 5 |
2 | 2011-01-01 02:00:00 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 5 | 27 | 32 | 2011 | 1 | 1 | 2 | 0 | 0 | 5 |
3 | 2011-01-01 03:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 3 | 10 | 13 | 2011 | 1 | 1 | 3 | 0 | 0 | 5 |
4 | 2011-01-01 04:00:00 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 0 | 1 | 1 | 2011 | 1 | 1 | 4 | 0 | 0 | 5 |
train.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype='object')
datetime_names = ['year', 'month', 'day', 'hour', 'minute', 'second']
plt.figure(figsize=(15,15))
for idx, name in enumerate(datetime_names):
    plt.subplot(3, 2, idx + 1)
    sns.barplot(x=name, y='count', data=new_tr)
plt.show()
new_test['year'] = new_test['datetime'].dt.year
new_test['month'] = new_test['datetime'].dt.month
new_test['day'] = new_test['datetime'].dt.day
new_test['dayofweek'] = new_test['datetime'].dt.dayofweek
new_test['hour'] = new_test['datetime'].dt.hour
new_test['minute'] = new_test['datetime'].dt.minute
new_test['second'] = new_test['datetime'].dt.second
col_names = ['year','month','day','hour','dayofweek']
i = 0
plt.figure(figsize=(12, 35))  ## size of the whole figure
for name in col_names:        ## one pair of plots per column name
    i = i + 1
    plt.subplot(6, 2, i)      ## 6x2 grid; cells fill left to right, top to bottom
    sns.countplot(x=name, data=new_tr)
    plt.title("train feature")
    i = i + 1
    plt.subplot(6, 2, i)
    sns.countplot(x=name, data=new_test)
    plt.title("test feature")
plt.show()
new_tr['dayofweek'] = new_tr['datetime'].dt.dayofweek # Monday=0, Sunday=6
datetime_names = ['year', 'month', 'day', 'hour', 'dayofweek', 'second']
i = 0
plt.figure(figsize=(15, 15))
for name in datetime_names:
    i = i + 1
    plt.subplot(3, 2, i)
    sns.barplot(x=name, y='count', data=new_tr)
plt.show()
print(new_test.shape)
new_test[["datetime", "year", "month", "day", "hour",
"minute", "second", "dayofweek"]].head()
(6493, 16)
 | datetime | year | month | day | hour | minute | second | dayofweek |
---|---|---|---|---|---|---|---|---|
0 | 2011-01-20 00:00:00 | 2011 | 1 | 20 | 0 | 0 | 0 | 3 |
1 | 2011-01-20 01:00:00 | 2011 | 1 | 20 | 1 | 0 | 0 | 3 |
2 | 2011-01-20 02:00:00 | 2011 | 1 | 20 | 2 | 0 | 0 | 3 |
3 | 2011-01-20 03:00:00 | 2011 | 1 | 20 | 3 | 0 | 0 | 3 |
4 | 2011-01-20 04:00:00 | 2011 | 1 | 20 | 4 | 0 | 0 | 3 |
plt.figure(figsize=(15,10))
g = sns.heatmap(new_tr.corr(numeric_only=True), annot=True, fmt=".2f", cmap="coolwarm", cbar=False)  # numeric_only skips the datetime column (pandas >= 1.5)
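To read the heatmap numerically, the correlations with count can be extracted and ranked; a minimal sketch (numeric_only needs pandas >= 1.5 and the key argument needs pandas >= 1.1):

# features ranked by absolute correlation with the rental count
corr_with_count = new_tr.corr(numeric_only=True)['count'].drop('count')
print(corr_with_count.sort_values(key=abs, ascending=False))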
feature_names = ['season', 'holiday', 'workingday', 'weather',
                 'temp', 'atemp', 'humidity', 'windspeed',
                 'year', 'hour', 'dayofweek']  # selected features
X_tr_all = new_tr[feature_names]      # feature matrix for the training data
X_test_all = new_test[feature_names]  # feature matrix for the test data
X_tr_all.head()
 | season | holiday | workingday | weather | temp | atemp | humidity | windspeed | year | hour | dayofweek |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 81 | 0.0 | 2011 | 0 | 5 |
1 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 2011 | 1 | 5 |
2 | 1 | 0 | 0 | 1 | 9.02 | 13.635 | 80 | 0.0 | 2011 | 2 | 5 |
3 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 2011 | 3 | 5 |
4 | 1 | 0 | 0 | 1 | 9.84 | 14.395 | 75 | 0.0 | 2011 | 4 | 5 |
label_name = 'count'           # rental count (dependent variable / target)
y_tr_all = new_tr[label_name]  # target vector
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tr_all,
y_tr_all,
test_size=0.3,
random_state=77)
from sklearn.linear_model import LinearRegression
model = LinearRegression()  # create the model object
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Model evaluation: R^2 (coefficient of determination)
print("Train R^2: {:.3f}".format(model.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(model.score(X_test, y_test)))
# MSE (mean squared error)
mse_val = ((pred - y_test) ** 2).sum() / len(pred)
print("mse value : {:.3f}".format(mse_val))
Train R^2: 0.391
Test R^2: 0.377
mse value : 20134.965
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()  # create the model object
model.fit(X_train, y_train)
pred = model.predict(X_test)
# Model evaluation: R^2 (coefficient of determination)
print("Train R^2: {:.3f}".format(model.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(model.score(X_test, y_test)))
# MSE (mean squared error)
mse_val = ((pred - y_test) ** 2).sum() / len(pred)
print("mse value : {:.3f}".format(mse_val))
Train R^2: 1.000
Test R^2: 0.893
mse value : 3450.174
A train R^2 of 1.000 against a test R^2 of 0.893 shows the unpruned tree memorizing the training data, i.e. overfitting.
from sklearn.ensemble import RandomForestRegressor  # ensemble (an extension of decision trees)
seed = 37
model = RandomForestRegressor(n_jobs=-1, random_state=seed)  # create the model object
model.fit(X_train, y_train)  # train the model
pred = model.predict(X_test)
# R^2
print("Train R^2: {:.3f}".format(model.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(model.score(X_test, y_test)))
# MSE (mean squared error)
mse_val = ((pred - y_test) ** 2).sum() / len(pred)
print("mse value : {:.3f}".format(mse_val))
Train R^2: 0.992
Test R^2: 0.946
mse value : 1752.403
from sklearn.ensemble import GradientBoostingRegressor
seed = 37
model = GradientBoostingRegressor(random_state=seed)  # create the model object
model.fit(X_train, y_train)  # train the model
pred = model.predict(X_test)
# R^2
print("Train R^2: {:.3f}".format(model.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(model.score(X_test, y_test)))
# MSE (mean squared error)
mse_val = ((pred - y_test) ** 2).sum() / len(pred)
print("mse value : {:.3f}".format(mse_val))
Train R^2: 0.860
Test R^2: 0.849
mse value : 4892.054
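A single train/test split can flatter one model over another; as a hedged sketch (the grid values below are illustrative, not from this notebook), the strongest candidate can also be tuned with cross-validation:

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# illustrative parameter grid (assumption); 3-fold CV on the training split
param_grid = {'n_estimators': [100, 300], 'max_depth': [None, 10, 20]}
search = GridSearchCV(RandomForestRegressor(n_jobs=-1, random_state=37),
                      param_grid, cv=3, scoring='r2')
search.fit(X_train, y_train)
print(search.best_params_, "best CV R^2: {:.3f}".format(search.best_score_))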
sub = pd.read_csv("../bike/sampleSubmission.csv")
sub.head()
 | datetime | count |
---|---|---|
0 | 2011-01-20 00:00:00 | 0 |
1 | 2011-01-20 01:00:00 | 0 |
2 | 2011-01-20 02:00:00 | 0 |
3 | 2011-01-20 03:00:00 | 0 |
4 | 2011-01-20 04:00:00 | 0 |
from sklearn.ensemble import RandomForestRegressor  # ensemble (an extension of decision trees)
seed = 37
model = RandomForestRegressor(n_jobs=-1, random_state=seed)  # create the model object
model.fit(X_train, y_train)  # train the model
RandomForestRegressor(n_jobs=-1, random_state=37)
pred = model.predict(X_test_all)  # predict on the Kaggle test features
sub['count'] = pred
sub.loc[sub['count'] < 0, 'count'] = 0
sub.head(3)
 | datetime | count |
---|---|---|
0 | 2011-01-20 00:00:00 | 11.75 |
1 | 2011-01-20 01:00:00 | 4.20 |
2 | 2011-01-20 02:00:00 | 4.78 |
# submission CSV; index=False drops the row numbers
sub.to_csv("third_sub.csv", index=False)
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PolynomialFeatures
feature_names = ['season', 'holiday', 'workingday', 'weather',
                 'temp', 'atemp', 'humidity', 'windspeed',
                 'year', 'hour', 'dayofweek']  # shared feature list
X_tr_all = new_tr[feature_names]      # training features
X_test_all = new_test[feature_names]  # test features
print(X_tr_all.head())
   season  holiday  workingday  weather  temp   atemp  humidity  windspeed  \
0       1        0           0        1  9.84  14.395        81        0.0
1       1        0           0        1  9.02  13.635        80        0.0
2       1        0           0        1  9.02  13.635        80        0.0
3       1        0           0        1  9.84  14.395        75        0.0
4       1        0           0        1  9.84  14.395        75        0.0

   year  hour  dayofweek
0  2011     0          5
1  2011     1          5
2  2011     2          5
3  2011     3          5
4  2011     4          5
scaler = MinMaxScaler().fit(X_tr_all)      # fit the scaler on the training features only
nor_X_tr_all = scaler.transform(X_tr_all)  # scale each feature to [0, 1]
ex_X_tr = PolynomialFeatures(degree=2,
                             include_bias=False).fit_transform(nor_X_tr_all)  # degree-2 polynomial expansion
y_tr_all = new_tr['count']
X_train, X_test, y_train, y_test = train_test_split(ex_X_tr,
y_tr_all,
test_size=0.2,
random_state=42)
def rmlse(predict, actual):
    # root mean squared log error (commonly spelled RMSLE);
    # the metric is symmetric, so the argument order does not affect the result
    log_pred = np.log1p(predict)   # log(1 + x) handles zero counts safely
    log_actual = np.log1p(actual)
    squared_error = (log_pred - log_actual) ** 2
    rmsle_value = np.sqrt(np.mean(squared_error))
    return rmsle_value
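For reference, scikit-learn provides the same quantity through mean_squared_log_error; a minimal sketch of the equivalence (assuming non-negative inputs, which this dataset satisfies), with rmsle_sklearn being a name introduced here for illustration:

from sklearn.metrics import mean_squared_log_error

def rmsle_sklearn(y_true, y_pred):
    # same quantity as the rmlse() helper above, via scikit-learn
    return np.sqrt(mean_squared_log_error(y_true, y_pred))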
from sklearn.ensemble import RandomForestRegressor  # ensemble (an extension of decision trees)
seed = 37
model = RandomForestRegressor(n_jobs=-1, random_state=seed)  # create the model object
model.fit(X_train, y_train)  # train the model
pred = model.predict(X_test)
# R^2
print("Train R^2: {:.3f}".format(model.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(model.score(X_test, y_test)))
# MAE, MSE, RMSE, RMLSE
mae_val = ( np.abs(pred - y_test) ).sum() / len(pred)
mse_val = ( (pred - y_test)**2 ).sum() / len(pred)
rmse_val = np.sqrt( mse_val )
rmlse_val = rmlse(pred, y_test)
print("MAE : {:.3f}".format( mae_val ))
print("MSE : {:.3f}".format( mse_val ))
print("RMSE : {:.3f}".format( rmse_val ))
print("RMLSE : {:.3f}".format( rmlse_val ))
Train R^2: 0.992
Test R^2: 0.950
MAE : 25.784
MSE : 1659.031
RMSE : 40.731
RMLSE : 0.342
nor_X_test_all = scaler.transform(X_test_all)  # reuse the scaler fitted on the training data
ex_X_test = PolynomialFeatures(degree=2,
                               include_bias=False).fit_transform(nor_X_test_all)
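Refitting PolynomialFeatures on the test data happens to be harmless here because the expansion is deterministic for a fixed degree, but reusing one fitted pipeline avoids the question entirely; a minimal sketch, with the _alt names introduced purely for illustration:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures

# fit scaler + polynomial expansion on train, then apply the same fitted objects to test
prep = make_pipeline(MinMaxScaler(), PolynomialFeatures(degree=2, include_bias=False))
ex_X_tr_alt = prep.fit_transform(X_tr_all)   # fit on the training features only
ex_X_test_alt = prep.transform(X_test_all)   # no refit on the test features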
pred = model.predict(ex_X_test)  # predict
sub['count'] = pred
sub.loc[sub['count'] < 0, 'count'] = 0
sub.head(3)
 | datetime | count |
---|---|---|
0 | 2011-01-20 00:00:00 | 13.66 |
1 | 2011-01-20 01:00:00 | 5.08 |
2 | 2011-01-20 02:00:00 | 3.53 |
sub.to_csv("four_sub.csv", index=False)
import warnings
warnings.filterwarnings(action='ignore')  # silence warning messages from here on
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor
print(X_train.shape, y_train.shape)
(8708, 77) (8708,)
select = SelectFromModel(RandomForestRegressor(n_estimators=100,
random_state=37),
threshold="0.1 * median")
select.fit(X_train, y_train)
SelectFromModel(estimator=RandomForestRegressor(random_state=37), threshold='0.1 * median')
X_train_l1 = select.transform(X_train)
X_test_l1 = select.transform(X_test)
X_train_l1.shape, X_test_l1.shape
((8708, 68), (2178, 68))
### Check which features were selected
mask = select.get_support()
print(mask)
plt.matshow(mask.reshape(1, -1), cmap='gray_r')
plt.xlabel("feature number")
[ True False True True True True True True True True True True False True True True True True True True True True False False False True True True False False False False True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True True]
Text(0.5, 0, 'feature number')
seed = 37
model = RandomForestRegressor(n_jobs=-1, random_state=seed)  # create the model object
model.fit(X_train_l1, y_train)  # train on the selected features
pred = model.predict(X_test_l1)
# MAE, MSE, RMSE, RMLSE
mae_val = ( np.abs(pred - y_test) ).sum() / len(pred)
mse_val = ( (pred - y_test)**2 ).sum() / len(pred)
rmse_val = np.sqrt( mse_val )
rmlse_val = rmlse(pred, y_test)
print("MAE : {:.3f}".format( mae_val ))
print("MSE : {:.3f}".format( mse_val ))
print("RMSE : {:.3f}".format( rmse_val ))
print("RMLSE : {:.3f}".format( rmlse_val ))
MAE : 25.675 MSE : 1644.395 RMSE : 40.551 RMLSE : 0.342
X_test_l1_all = select.transform(ex_X_test)
X_test_l1_all.shape
(6493, 68)
pred = model.predict(X_test_l1_all)
sub['count'] = pred
sub.loc[sub['count'] < 0, 'count'] = 0
sub.to_csv('five_sub.csv', index=False)
import xgboost as xgb
### xgb.DMatrix
# * builds a DMatrix object from a dense matrix, a sparse matrix, or a local file.
# data_dmatrix = xgb.DMatrix(data=ex_X_tr,label=y_tr_all)
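As a hedged sketch of the native (non scikit-learn) API, a DMatrix built from the expanded training features can be passed straight to xgb.train; the parameter values below are illustrative, not from this notebook:

# native xgboost API: build a DMatrix and boost directly (illustrative parameters)
dtrain = xgb.DMatrix(data=ex_X_tr, label=y_tr_all)
params = {'objective': 'reg:squarederror', 'max_depth': 4, 'eta': 0.1}
booster = xgb.train(params, dtrain, num_boost_round=100)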
X_train, X_test, y_train, y_test = train_test_split(ex_X_tr,
y_tr_all,
test_size=0.2,
random_state=42)
# inspect the default options
xg_reg = xgb.XGBRegressor()
xg_reg
XGBRegressor(base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=None, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=None, max_bin=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=None, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, predictor=None, random_state=None, reg_alpha=None, reg_lambda=None, ...)
Parameter | Description | scikit-learn default (native default) |
---|---|---|
learning_rate (or eta) | value in [0, 1]; learning rate that helps prevent overfitting | 0.1 (0.3) |
n_estimators (or num_boost_rounds) | number of trees | 100 (10) |
max_depth | maximum depth of each individual tree | 3 (6) |
subsample | fraction of the data sampled for each tree; low values can cause underfitting | 1 |
colsample_bytree | fraction of features used per tree; high values can lead to overfitting | 1 |
reg_alpha (or alpha) | L1 regularization term; worth considering when there are many features | 0 |
reg_lambda (or lambda) | L2 regularization term; worth considering when there are many features | 1 |
scale_pos_weight | balances an imbalanced dataset | 1 |
Parameter | Description |
---|---|
objective (objective function) | reg:linear for regression problems; reg:logistic for classification returning only a decision; binary:logistic for binary classification returning a probability |
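One caveat visible in the logs further below: reg:linear is deprecated in recent XGBoost releases in favor of reg:squarederror, the same squared-error objective under its modern name. A minimal sketch (the variable name is introduced here for illustration):

# modern spelling of the regression objective; avoids the reg:linear deprecation warning
xg_reg_modern = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=100)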
feature_names = ['season', 'holiday', 'workingday', 'weather',
                 'temp', 'atemp', 'humidity', 'windspeed',
                 'year', 'hour', 'dayofweek']  # shared feature list
X_tr_all = new_tr[feature_names]      # training features
X_test_all = new_test[feature_names]  # test features
print(X_tr_all.head())
   season  holiday  workingday  weather  temp   atemp  humidity  windspeed  \
0       1        0           0        1  9.84  14.395        81        0.0
1       1        0           0        1  9.02  13.635        80        0.0
2       1        0           0        1  9.02  13.635        80        0.0
3       1        0           0        1  9.84  14.395        75        0.0
4       1        0           0        1  9.84  14.395        75        0.0

   year  hour  dayofweek
0  2011     0          5
1  2011     1          5
2  2011     2          5
3  2011     3          5
4  2011     4          5
ex_X_tr = PolynomialFeatures(degree=2,
include_bias=False).fit_transform(X_tr_all)
y_tr_all = new_tr['count']
X_tr_all.shape, ex_X_tr.shape
((10886, 11), (10886, 77))
X_train, X_test, y_train, y_test = train_test_split(ex_X_tr,
y_tr_all,
test_size=0.2,
random_state=42)
xg_reg = xgb.XGBRegressor(objective='reg:linear',
                          colsample_bytree=0.3,  # fraction of features used per tree
                          learning_rate=0.1,
                          max_depth=4,
                          alpha=0.1,
                          n_estimators=100)
xg_reg
XGBRegressor(alpha=0.1, base_score=None, booster=None, callbacks=None, colsample_bylevel=None, colsample_bynode=None, colsample_bytree=0.3, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=None, gpu_id=None, grow_policy=None, importance_type=None, interaction_constraints=None, learning_rate=0.1, max_bin=None, max_cat_to_onehot=None, max_delta_step=None, max_depth=4, max_leaves=None, min_child_weight=None, missing=nan, monotone_constraints=None, n_estimators=100, n_jobs=None, num_parallel_tree=None, objective='reg:linear', predictor=None, random_state=None, ...)
xg_reg.fit(X_train, y_train)
[16:15:33] WARNING: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-3.7/xgboost/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
XGBRegressor(alpha=0.1, base_score=0.5, booster='gbtree', callbacks=None, colsample_bylevel=1, colsample_bynode=1, colsample_bytree=0.3, early_stopping_rounds=None, enable_categorical=False, eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise', importance_type=None, interaction_constraints='', learning_rate=0.1, max_bin=256, max_cat_to_onehot=4, max_delta_step=0, max_depth=4, max_leaves=0, min_child_weight=1, missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0, num_parallel_tree=1, objective='reg:linear', predictor='auto', random_state=0, ...)
# R^2
print("Train R^2: {:.3f}".format(xg_reg.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(xg_reg.score(X_test, y_test)))
Train R^2: 0.940
Test R^2: 0.930
%%time
num_list = [100, 200, 300, 500, 1000, 1500, 2000, 3000]
for num in num_list:
    xg_reg = xgb.XGBRegressor(objective='reg:linear',
                              colsample_bytree=0.3,  # fraction of features used per tree
                              learning_rate=0.1,
                              max_depth=3,
                              alpha=0.1,
                              n_estimators=num)
    xg_reg.fit(X_train, y_train)
    pred = xg_reg.predict(X_test)
    # MAE, MSE, RMSE, RMLSE
    mae_val = (np.abs(pred - y_test)).sum() / len(pred)
    mse_val = ((pred - y_test) ** 2).sum() / len(pred)
    rmse_val = np.sqrt(mse_val)
    rmlse_val = rmlse(pred, y_test)
    print("MAE : {:.3f}".format(mae_val))
    print("MSE : {:.3f}".format(mse_val))
    print("RMSE : {:.3f}".format(rmse_val))
    print("RMLSE : {:.3f}".format(rmlse_val))
    print()
[16:15:34] WARNING: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-3.7/xgboost/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror. (warning repeated for each fit)
n_estimators=100 : MAE : 39.436  MSE : 3376.439  RMSE : 58.107  RMLSE : 0.562
n_estimators=200 : MAE : 34.143  MSE : 2515.522  RMSE : 50.155  RMLSE : 0.550
n_estimators=300 : MAE : 31.513  MSE : 2189.830  RMSE : 46.796  RMLSE : 0.516
n_estimators=500 : MAE : 28.937  MSE : 1894.832  RMSE : 43.530  RMLSE : 0.468
n_estimators=1000: MAE : 26.922  MSE : 1665.535  RMSE : 40.811  RMLSE : 0.472
n_estimators=1500: MAE : 26.258  MSE : 1597.554  RMSE : 39.969  RMLSE : 0.478
n_estimators=2000: MAE : 25.982  MSE : 1566.315  RMSE : 39.577  RMLSE : 0.470
n_estimators=3000: MAE : 25.899  MSE : 1556.697  RMSE : 39.455  RMLSE : 0.468
CPU times: user 3min 52s, sys: 2.51 s, total: 3min 54s
Wall time: 1min 19s
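Rather than sweeping n_estimators by hand as above, boosting can be stopped automatically once a held-out metric stops improving. A minimal sketch, assuming XGBoost >= 1.6 (older releases take early_stopping_rounds in fit() instead of the constructor); xg_es is a name introduced for illustration:

# stop adding trees once validation RMSE has not improved for 50 rounds
xg_es = xgb.XGBRegressor(objective='reg:squarederror',
                         n_estimators=3000,
                         learning_rate=0.1,
                         early_stopping_rounds=50,  # constructor argument in XGBoost >= 1.6
                         eval_metric='rmse')
xg_es.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
print("best iteration:", xg_es.best_iteration)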
xg_reg = xgb.XGBRegressor(objective='reg:linear',
                          colsample_bytree=0.3,  # fraction of features used per tree
                          learning_rate=0.05,
                          max_depth=5,
                          alpha=0.1,
                          n_estimators=3000)
xg_reg.fit(X_train, y_train)
print("Train R^2: {:.3f}".format(xg_reg.score(X_train, y_train)))
print("Test R^2: {:.3f}".format(xg_reg.score(X_test, y_test)))
pred = xg_reg.predict(X_test)
# MAE, MSE, RMSE, RMLSE
mae_val = (np.abs(pred - y_test)).sum() / len(pred)
mse_val = ((pred - y_test) ** 2).sum() / len(pred)
rmse_val = np.sqrt(mse_val)
rmlse_val = rmlse(pred, y_test)
print("MAE : {:.3f}".format(mae_val))
print("MSE : {:.3f}".format(mse_val))
print("RMSE : {:.3f}".format(rmse_val))
print("RMLSE : {:.3f}".format(rmlse_val))
[16:16:53] WARNING: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-10.9-x86_64-3.7/xgboost/src/objective/regression_obj.cu:203: reg:linear is now deprecated in favor of reg:squarederror.
Train R^2: 0.998
Test R^2: 0.959
MAE : 25.899
MSE : 1556.697
RMSE : 39.455
RMLSE : 0.468
ex_X_test = PolynomialFeatures(degree=2,
include_bias=False).fit_transform(X_test_all)
pred = xg_reg.predict(ex_X_test)  # predict on the Kaggle test features
sub['count'] = pred
sub.loc[sub['count'] < 0, 'count'] = 0
sub.head(3)
 | datetime | count |
---|---|---|
0 | 2011-01-20 00:00:00 | 16.371508 |
1 | 2011-01-20 01:00:00 | 0.342722 |
2 | 2011-01-20 02:00:00 | 0.000000 |
# write the submission CSV, dropping the row index
sub.to_csv("five_xgb_sub.csv", index=False)
Created for educational purposes; prior permission is required for distribution or reproduction.
Copyright 2021 LIM Co. all rights reserved.