import pandas as pd
# Load the train/test splits; both reads share the same options so the
# `datetime` column is parsed as a datetime in each frame.
_read_opts = {"parse_dates": ['datetime']}
train = pd.read_csv("bike/train.csv", **_read_opts)
test = pd.read_csv("bike/test.csv", **_read_opts)
# Column labels of the training frame.
train.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'casual', 'registered', 'count'], dtype='object')
# Column labels of the test frame (the recorded output shows no `casual`,
# `registered` or `count` columns here, unlike the training frame).
test.columns
Index(['datetime', 'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp', 'humidity', 'windspeed'], dtype='object')
# Print dtypes, non-null counts and memory usage of the training frame.
train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10886 entries, 0 to 10885 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 10886 non-null datetime64[ns] 1 season 10886 non-null int64 2 holiday 10886 non-null int64 3 workingday 10886 non-null int64 4 weather 10886 non-null int64 5 temp 10886 non-null float64 6 atemp 10886 non-null float64 7 humidity 10886 non-null int64 8 windspeed 10886 non-null float64 9 casual 10886 non-null int64 10 registered 10886 non-null int64 11 count 10886 non-null int64 dtypes: datetime64[ns](1), float64(3), int64(8) memory usage: 1020.7 KB
# Print dtypes, non-null counts and memory usage of the test frame.
test.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 6493 entries, 0 to 6492 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 datetime 6493 non-null datetime64[ns] 1 season 6493 non-null int64 2 holiday 6493 non-null int64 3 workingday 6493 non-null int64 4 weather 6493 non-null int64 5 temp 6493 non-null float64 6 atemp 6493 non-null float64 7 humidity 6493 non-null int64 8 windspeed 6493 non-null float64 dtypes: datetime64[ns](1), float64(3), int64(5) memory usage: 456.7 KB
import matplotlib.pyplot as plt
import matplotlib
# Summarize the timestamp column: row count, number of distinct values and
# the covered time range (earliest / latest timestamp).
# Explicit aggregations are used instead of `describe()`, whose
# categorical-style summary of datetime data is deprecated and emits a
# FutureWarning; these aggregations behave the same across pandas versions.
train['datetime'].agg(['count', 'nunique', 'min', 'max'])
<ipython-input-13-120836598240>:1: FutureWarning: Treating datetime data as categorical rather than numeric in `.describe` is deprecated and will be removed in a future version of pandas. Specify `datetime_is_numeric=True` to silence this warning and adopt the future behavior now. train.datetime.describe()
count 10886 unique 10886 top 2011-06-09 04:00:00 freq 1 first 2011-01-01 00:00:00 last 2012-12-19 23:00:00 Name: datetime, dtype: object
# Scatter of `count` (x-axis) against `temp` (y-axis); the low alpha makes
# dense regions stand out. Axis labels are set explicitly because the target
# sits on the x-axis here, which is easy to misread on an unlabeled plot.
plt.plot(train['count'], train['temp'], 'o', alpha=0.2)
plt.xlabel("count")
plt.ylabel("temp")
[<matplotlib.lines.Line2D at 0x27bcac1db20>]
# Correlation of every numeric column with `count`.
# Numeric columns are selected explicitly: `train` also holds a datetime
# column, and newer pandas versions no longer drop non-numeric columns
# implicitly inside `corr()`.
train.select_dtypes(include='number').corr()['count']
season 0.163439 holiday -0.005393 workingday 0.011594 weather -0.128655 temp 0.394454 atemp 0.389784 humidity -0.317371 windspeed 0.101369 casual 0.690414 registered 0.970948 count 1.000000 Name: count, dtype: float64
# Absolute correlation of every numeric column with `count`, strongest first.
# Numeric columns are selected explicitly: `train` also holds a datetime
# column, and newer pandas versions no longer drop non-numeric columns
# implicitly inside `corr()`.
train.select_dtypes(include='number').corr()['count'].abs().sort_values(ascending=False)
count 1.000000 registered 0.970948 casual 0.690414 temp 0.394454 atemp 0.389784 humidity 0.317371 season 0.163439 weather 0.128655 windspeed 0.101369 workingday 0.011594 holiday 0.005393 Name: count, dtype: float64
# Absolute correlation of every numeric column with `count`, weakest first
# (ascending order puts the strongest feature at the top of a horizontal bar
# chart). Numeric columns are selected explicitly: `train` also holds a
# datetime column, and newer pandas versions no longer drop non-numeric
# columns implicitly inside `corr()`.
count_corr = train.select_dtypes(include='number').corr()['count']
data = count_corr.abs().sort_values(ascending=True)
print(data.index)
print(data.values)
Index(['holiday', 'workingday', 'windspeed', 'weather', 'season', 'humidity', 'atemp', 'temp', 'casual', 'registered', 'count'], dtype='object') [0.00539298 0.01159387 0.10136947 0.1286552 0.16343902 0.31737148 0.38978444 0.39445364 0.69041357 0.97094811 1. ]
# Horizontal bar chart: one bar per entry of `data`, labelled by its index.
plt.barh(data.index, data.values)
<BarContainer object of 11 artists>
# Same horizontal bar chart of `data`, now with a title and axis labels.
plt.barh(y=data.index, width=data.values)
plt.xlabel("corr values")
plt.title("count corr values")
plt.ylabel("features")
Text(0, 0.5, 'features')
# Number of rows per distinct `season` value, most frequent first.
train['season'].value_counts()
4 2734 3 2733 2 2733 1 2686 Name: season, dtype: int64
# Bar chart of the row count per `season` value, using the numeric codes
# directly as x positions.
data = train['season'].value_counts()
plt.bar(x=data.index, height=data.values)
<BarContainer object of 4 artists>
# Same bar chart, with the index cast to strings so matplotlib treats the
# codes as category labels instead of positions on a numeric axis.
plt.bar(data.index.astype(str), data.values)
<BarContainer object of 4 artists>
# Number of rows per distinct `holiday` value, most frequent first.
train['holiday'].value_counts()
0 10575 1 311 Name: holiday, dtype: int64
# Bar chart of the row count per `holiday` value; the codes are cast to
# strings so they are drawn as category labels.
data = train['holiday'].value_counts()
plt.bar(x=data.index.astype(str), height=data.values)
<BarContainer object of 2 artists>
# Number of rows per distinct `weather` value, most frequent first.
train['weather'].value_counts()
1 7192 2 2834 3 859 4 1 Name: weather, dtype: int64
# Bar chart of the row count per `weather` value; the codes are cast to
# strings so they are drawn as category labels.
data = train['weather'].value_counts()
plt.bar(x=data.index.astype(str), height=data.values)
<BarContainer object of 4 artists>
# 2x2 grid of histograms, one panel per continuous feature, filled in
# row-major order.
plt.figure(figsize=(15,10))
for position, column in enumerate(['temp', 'atemp', 'humidity', 'windspeed'], start=1):
    plt.subplot(2, 2, position)
    plt.hist(train[column])
(array([1.313e+03, 4.083e+03, 2.827e+03, 1.540e+03, 6.960e+02, 2.800e+02, 1.070e+02, 3.100e+01, 6.000e+00, 3.000e+00]), array([ 0. , 5.69969, 11.39938, 17.09907, 22.79876, 28.49845, 34.19814, 39.89783, 45.59752, 51.29721, 56.9969 ]), <BarContainer object of 10 artists>)
# Overlay the four feature histograms on one set of axes. Each histogram
# carries its column name as a label and a legend is drawn, because the
# translucent overlays cannot be told apart otherwise.
plt.figure(figsize=(15,10))
for column in ['temp', 'atemp', 'humidity', 'windspeed']:
    plt.hist(train[column], alpha=0.3, label=column)
plt.legend()
(array([1.313e+03, 4.083e+03, 2.827e+03, 1.540e+03, 6.960e+02, 2.800e+02, 1.070e+02, 3.100e+01, 6.000e+00, 3.000e+00]), array([ 0. , 5.69969, 11.39938, 17.09907, 22.79876, 28.49845, 34.19814, 39.89783, 45.59752, 51.29721, 56.9969 ]), <BarContainer object of 10 artists>)
from matplotlib import font_manager, rc
import matplotlib.pyplot as plt
import platform
# Choose a default matplotlib font family per operating system; later plots
# use Korean text in titles and labels, which is presumably why a specific
# font is configured here.
path = "C:/Windows/Fonts/malgun.ttf"
system_name = platform.system()
if system_name == "Windows":
    # Resolve the family name from the font file and make it the default.
    font_name = font_manager.FontProperties(fname=path).get_name()
    rc('font', family=font_name)
elif system_name == "Darwin":
    rc('font', family='AppleGothic')
else:
    # No font is configured on other systems; only report it.
    print("Unknown System")
# Applied on every platform: draw minus signs with the ASCII hyphen rather
# than the Unicode minus glyph (which the chosen font may not provide).
matplotlib.rcParams['axes.unicode_minus'] = False
# 2x2 grid of feature histograms with an x-label per panel and a shared
# figure title. A colour of None keeps matplotlib's default bar colour.
plt.figure(figsize=(15,10))
panels = [
    ("temp", None),
    ("atemp", "#88c999"),
    ("humidity", '#B652BE'),
    ("windspeed", None),
]
for slot, (feature, bar_color) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    plt.hist(train[feature], color=bar_color)
    plt.xlabel(feature, size=17)
plt.suptitle("피처의 값의 분포", size=20)
Text(0.5, 0.98, '피처의 값의 분포')
# Print the number of non-null `weather` entries, then the share of rows
# belonging to each weather value.
all_cnt = train['weather'].count()
print(all_cnt)
print(train['weather'].value_counts().div(all_cnt))
10886 1 0.660665 2 0.260334 3 0.078909 4 0.000092 Name: weather, dtype: float64
# Pie chart of the share of rows per `weather` category.
# The slices are labelled with the actual weather codes and the legend is
# titled "weather": the values come from the `weather` column, so season
# names would mislabel them. `normalize=True` yields the same ratios as
# dividing the counts by the number of non-null entries.
plt.figure(figsize=(10,10))
dat = train['weather'].value_counts(normalize=True)
dat.index = dat.index.astype(str)
plt.pie(dat.values, labels=dat.index)
plt.legend(title='날씨')
<matplotlib.legend.Legend at 0x27bd09309a0>