import pandas as pd
# Load the train and test splits; the 'datetime' column is parsed into
# datetime64 values while reading.
train, test = (
    pd.read_csv(csv_path, parse_dates=['datetime'])
    for csv_path in ("bike/train.csv", "bike/test.csv")
)
# Show the column labels of the training frame.
train.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype='object')
# Show the column labels of the test frame (compare with the training frame
# to see which columns are only available for training).
test.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed'], dtype='object')
# Print per-column dtype and non-null count plus memory usage for the training frame.
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null datetime64[ns] 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(8) memory usage: 1020.7 KB
# Print per-column dtype and non-null count plus memory usage for the test frame.
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6493 entries, 0 to 6492 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 6493 non-null datetime64[ns] 1 season 6493 non-null int64 2 holiday 6493 non-null int64 3 workingday 6493 non-null int64 4 weather 6493 non-null int64 5 temp 6493 non-null float64 6 atemp 6493 non-null float64 7 humidity 6493 non-null int64 8 windspeed 6493 non-null float64 dtypes: datetime64[ns](1), float64(3), int64(5) memory usage: 456.7 KB
import matplotlib.pyplot as plt
import matplotlib
# Summarise the timestamp column (count, range, ...).
# NOTE(review): the recorded run emitted a pandas FutureWarning suggesting
# `datetime_is_numeric=True`; the call is intentionally left unchanged here.
train['datetime'].describe()
<ipython-input-12-120836598240>:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. train.datetime.describe()
count 10886 unique 10886 top 2011-06-09 04:00:00 freq 1 first 2011-01-01 00:00:00 last 2012-12-19 23:00:00 Name: datetime, dtype: object
# Line plot of the rental count over time.
train.plot.line(x='datetime', y='count')
<AxesSubplot:xlabel='datetime'>
# Marker-only scatter (no connecting line) with count on the x-axis and temp
# on the y-axis; low alpha makes overlapping points visible as density.
# NOTE(review): the seaborn plot further down uses x='temp', y='count' —
# confirm whether the axes here are intentionally the other way round.
plt.plot(train['count'], train['temp'], marker='o', linestyle='None', alpha=0.2)
[<matplotlib.lines.Line2D at 0x21184581400>]
import seaborn as sns
# Scatter of count against temp with seaborn's fitted regression line;
# large, nearly transparent markers keep the dense regions readable.
sns.lmplot(
    data=train,
    x='temp',
    y='count',
    scatter_kws={"s":50, "alpha":0.1},
)
<seaborn.axisgrid.FacetGrid at 0x21186aaf340>
# Pairwise correlation matrix of the numeric columns.
# The numeric columns are selected explicitly instead of relying on corr()
# silently dropping the datetime column: pandas 2.0 changed the default to
# numeric_only=False, so a bare train.corr() is no longer portable.
# select_dtypes works on old and new pandas alike and yields the same matrix.
train.select_dtypes(include='number').corr()
season | holiday | workingday | weather | temp | atemp | humidity | windspeed | casual | registered | count | |
---|---|---|---|---|---|---|---|---|---|---|---|
season | 1.000000 | 0.029368 | -0.008126 | 0.008879 | 0.258689 | 0.264744 | 0.190610 | -0.147121 | 0.096758 | 0.164011 | 0.163439 |
holiday | 0.029368 | 1.000000 | -0.250491 | -0.007074 | 0.000295 | -0.005215 | 0.001929 | 0.008409 | 0.043799 | -0.020956 | -0.005393 |
workingday | -0.008126 | -0.250491 | 1.000000 | 0.033772 | 0.029966 | 0.024660 | -0.010880 | 0.013373 | -0.319111 | 0.119460 | 0.011594 |
weather | 0.008879 | -0.007074 | 0.033772 | 1.000000 | -0.055035 | -0.055376 | 0.406244 | 0.007261 | -0.135918 | -0.109340 | -0.128655 |
temp | 0.258689 | 0.000295 | 0.029966 | -0.055035 | 1.000000 | 0.984948 | -0.064949 | -0.017852 | 0.467097 | 0.318571 | 0.394454 |
atemp | 0.264744 | -0.005215 | 0.024660 | -0.055376 | 0.984948 | 1.000000 | -0.043536 | -0.057473 | 0.462067 | 0.314635 | 0.389784 |
humidity | 0.190610 | 0.001929 | -0.010880 | 0.406244 | -0.064949 | -0.043536 | 1.000000 | -0.318607 | -0.348187 | -0.265458 | -0.317371 |
windspeed | -0.147121 | 0.008409 | 0.013373 | 0.007261 | -0.017852 | -0.057473 | -0.318607 | 1.000000 | 0.092276 | 0.091052 | 0.101369 |
casual | 0.096758 | 0.043799 | -0.319111 | -0.135918 | 0.467097 | 0.462067 | -0.348187 | 0.092276 | 1.000000 | 0.497250 | 0.690414 |
registered | 0.164011 | -0.020956 | 0.119460 | -0.109340 | 0.318571 | 0.314635 | -0.265458 | 0.091052 | 0.497250 | 1.000000 | 0.970948 |
count | 0.163439 | -0.005393 | 0.011594 | -0.128655 | 0.394454 | 0.389784 | -0.317371 | 0.101369 | 0.690414 | 0.970948 | 1.000000 |
# Correlation of every numeric column with the target 'count'.
# Numeric columns are selected explicitly because newer pandas no longer
# drops the datetime column implicitly inside corr().
train.select_dtypes(include='number').corr()['count']
season 0.163439 holiday -0.005393 workingday 0.011594 weather -0.128655 temp 0.394454 atemp 0.389784 humidity -0.317371 windspeed 0.101369 casual 0.690414 registered 0.970948 count 1.000000 Name: count, dtype: float64
# Take absolute values and sort them (strongest correlation with 'count' first).
# Numeric columns are selected explicitly because newer pandas no longer
# drops the datetime column implicitly inside corr().
train.select_dtypes(include='number').corr()['count'].abs().sort_values(ascending=False)
count 1.000000 registered 0.970948 casual 0.690414 temp 0.394454 atemp 0.389784 humidity 0.317371 season 0.163439 weather 0.128655 windspeed 0.101369 workingday 0.011594 holiday 0.005393 Name: count, dtype: float64
# Absolute correlation with 'count', sorted ascending so that a horizontal
# bar chart drawn from it puts the strongest feature at the top.
# Numeric columns are selected explicitly because newer pandas no longer
# drops the datetime column implicitly inside corr().
data = train.select_dtypes(include='number').corr()['count'].abs().sort_values(ascending=True)
print(data.index)
print(data.values)
Index(['holiday', 'workingday', 'windspeed', 'weather', 'season', 'humidity', 'atemp', 'temp', 'casual', 'registered', 'count'], dtype='object') [0.00539298 0.01159387 0.10136947 0.1286552 0.16343902 0.31737148 0.38978444 0.39445364 0.69041357 0.97094811 1. ]
# Horizontal bars: one bar per label in `data`, bar length = its value.
plt.barh(y=data.index, width=data.values)
<BarContainer object of 11 artists>
# Horizontal bar chart of `data` with a title and axis labels.
# ylabel is kept as the final call so the cell still evaluates to its Text.
plt.barh(y=data.index, width=data.values)
plt.xlabel("corr values")
plt.title("count corr values")
plt.ylabel("features")
Text(0, 0.5, 'features')
# Values and their frequencies: number of rows per season code.
train.season.value_counts()
4 2734 3 2733 2 2733 1 2686 Name: season, dtype: int64
# Bar chart of row counts per season code, using the integer codes as
# numeric x positions.
data = train.season.value_counts()
plt.bar(x=data.index, height=data.values)
<BarContainer object of 4 artists>
# Same counts, but with the index cast to strings so the bars are drawn
# against category labels rather than numeric positions.
plt.bar(x=data.index.astype(str), height=data.values)
<BarContainer object of 4 artists>
# Number of rows per holiday flag value.
train.holiday.value_counts()
0 10575 1 311 Name: holiday, dtype: int64
# Bar chart of row counts per holiday flag, with string category labels.
data = train.holiday.value_counts()
plt.bar(x=data.index.astype(str), height=data.values)
<BarContainer object of 2 artists>
# Number of rows per weather code.
train.weather.value_counts()
1 7192 2 2834 3 859 4 1 Name: weather, dtype: int64
# Bar chart of row counts per weather code, with string category labels.
data = train.weather.value_counts()
plt.bar(x=data.index.astype(str), height=data.values)
<BarContainer object of 4 artists>
# 2x2 grid of default-binned histograms, one continuous feature per panel:
# temp | atemp
# humidity | windspeed
# The windspeed histogram stays last so the cell evaluates to its return value.
plt.figure(figsize=(15, 10))
plt.subplot(221)
plt.hist(train['temp'])
plt.subplot(222)
plt.hist(train['atemp'])
plt.subplot(223)
plt.hist(train['humidity'])
plt.subplot(224)
plt.hist(train['windspeed'])
(array([1.313e+03, 4.083e+03, 2.827e+03, 1.540e+03, 6.960e+02, 2.800e+02, 1.070e+02, 3.100e+01, 6.000e+00, 3.000e+00]), array([ 0. , 5.69969, 11.39938, 17.09907, 22.79876, 28.49845, 34.19814, 39.89783, 45.59752, 51.29721, 56.9969 ]), <BarContainer object of 10 artists>)
# Overlay the four feature histograms on a single axes; alpha=0.3 keeps the
# overlapping bars visible through each other.
# NOTE(review): no labels/legend are set, so the series are told apart only
# by matplotlib's default colour cycle.
plt.figure(figsize=(15, 10))
plt.hist(train['temp'], alpha=0.3)
plt.hist(train['atemp'], alpha=0.3)
plt.hist(train['humidity'], alpha=0.3)
plt.hist(train['windspeed'], alpha=0.3)
(array([1.313e+03, 4.083e+03, 2.827e+03, 1.540e+03, 6.960e+02, 2.800e+02, 1.070e+02, 3.100e+01, 6.000e+00, 3.000e+00]), array([ 0. , 5.69969, 11.39938, 17.09907, 22.79876, 28.49845, 34.19814, 39.89783, 45.59752, 51.29721, 56.9969 ]), <BarContainer object of 10 artists>)
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform
# Select a font per operating system so that the Korean titles and labels
# used in later plots can be rendered.
# The branch bodies are indented here; without indentation the if/elif/else
# chain is an IndentationError.
path = "C:/Windows/Fonts/malgun.ttf"  # only consulted on Windows
if platform.system() == "Windows":
    # Resolve the family name from the font file and make it the default.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif platform.system()=="Darwin":
    rc('font', family='AppleGothic')
else:
    # No font is configured on other systems; only a notice is printed.
    print("Unknown System")
# Draw negative tick labels with an ASCII hyphen instead of the Unicode minus
# sign, for fonts that lack that glyph.
matplotlib.rcParams['axes.unicode_minus'] = False
# 2x2 grid of feature histograms, each panel labelled on its x-axis, under a
# shared Korean figure title. suptitle stays last so the cell evaluates to
# its Text object.
plt.figure(figsize=(15, 10))

plt.subplot(221)
plt.hist(train['temp'])
plt.xlabel("temp", size=17)

plt.subplot(222)
plt.hist(train['atemp'], color="#88c999")
plt.xlabel("atemp", size=17)

plt.subplot(223)
plt.hist(train['humidity'], color='#B652BE')
plt.xlabel("humidity", size=17)

plt.subplot(224)
plt.hist(train['windspeed'])
plt.xlabel("windspeed", size=17)

plt.suptitle("피처의 값의 분포", size=20)
Text(0.5, 0.98, '피처의 값의 분포')
# Count the non-null weather entries once, print that total, then print each
# weather code's share of it.
all_cnt = train.weather.count()
print( all_cnt )
print( train.weather.value_counts() / all_cnt )
10886 1 0.660665 2 0.260334 3 0.078909 4 0.000092 Name: weather, dtype: float64
# Pie chart of the share of rows per season.
# The slices were previously computed from the 'weather' column while being
# labelled with season names under a "season" legend title, so the chart
# showed weather proportions under the wrong names. The slices now come from
# 'season', and each label is attached by season code rather than by position
# (value_counts orders by frequency, not by code).
# NOTE(review): assumes codes 1-4 mean spring, summer, fall, winter in that
# order — confirm against the dataset's data dictionary. If the chart was
# meant to show weather, keep the weather data and replace the labels/title
# instead.
plt.figure(figsize=(10,10))
dat = train['season'].value_counts(normalize=True).sort_index()
dat = dat.rename(index={1: '봄', 2: '여름', 3: '가을', 4: '겨울'})
plt.pie(dat.values, labels=dat.index)
plt.legend(title='계절')
<matplotlib.legend.Legend at 0x27bd09309a0>