import pandas as pd
import seaborn as sns
import numpy as np

# Report the installed pandas version (the recorded run printed 2.1.4).
print(pd.__version__)
# Load the iris dataset through seaborn's dataset loader.
iris = sns.load_dataset("iris")
# Bare expression: in a notebook cell this displays the DataFrame.
iris

2.1.4

# Show the column labels of the DataFrame.
print(iris.columns)

# Exercise: select the sepal_length column.
# Exercise: select the columns from sepal_width through petal_width.

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

# Pick the single column 'sepal_length'; the result is a Series.
iris.loc[:, 'sepal_length']

0      5.1
1      4.9
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: sepal_length, Length: 150, dtype: float64

# Select sepal_width through petal_width - (1) by listing the column names.
# Passing a list of labels returns a DataFrame holding exactly those columns,
# so the list must start at 'sepal_width' to match the stated task.
iris[['sepal_width', 'petal_length', 'petal_width']]

# Select sepal_width through petal_width - (2) with a label slice:
# DataFrame.loc[all rows, start:end]. Unlike positional slices, a label
# slice includes both the start and the end label.
iris.loc[:, 'sepal_width':'petal_width']

# Position-based selection: the first three rows and the first two columns.
# Either side of ':' may be omitted, so 0:3 is written as :3 here.
iris.iloc[:3, :2]

# Column labels as an Index object (displayed as the cell output).
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

# For every column, print its name followed by the frequency of each
# distinct value in that column.
for col_name, col_values in iris.items():
    print("컬럼명 : ", col_name)
    print(col_values.value_counts())

컬럼명 :  sepal_length
sepal_length
5.0    10
5.1     9
6.3     9
5.7     8
6.7     8
5.8     7
5.5     7
6.4     7
4.9     6
5.4     6
6.1     6
6.0     6
5.6     6
4.8     5
6.5     5
6.2     4
7.7     4
6.9     4
4.6     4
5.2     4
5.9     3
4.4     3
7.2     3
6.8     3
6.6     2
4.7     2
7.6     1
7.4     1
7.3     1
7.0     1
7.1     1
5.3     1
4.3     1
4.5     1
7.9     1
Name: count, dtype: int64
컬럼명 :  sepal_width
sepal_width
3.0    26
2.8    14
3.2    13
3.4    12
3.1    11
2.9    10
2.7     9
2.5     8
3.5     6
3.3     6
3.8     6
2.6     5
2.3     4
3.6     4
3.7     3
2.4     3
2.2     3
3.9     2
4.4     1
4.0     1
4.1     1
4.2     1
2.0     1
Name: count, dtype: int64
컬럼명 :  petal_length
petal_length
1.4    13
1.5    13
5.1     8
4.5     8
1.6     7
1.3     7
5.6     6
4.7     5
4.9     5
4.0     5
4.2     4
5.0     4
4.4     4
4.8     4
1.7     4
3.9     3
4.6     3
5.7     3
4.1     3
5.5     3
6.1     3
5.8     3
3.3     2
5.4     2
6.7     2
5.3     2
5.9     2
6.0     2
1.2     2
4.3     2
1.9     2
3.5     2
5.2     2
3.0     1
1.1     1
3.7     1
3.8     1
6.6     1
6.3     1
1.0     1
6.9     1
3.6     1
6.4     1
Name: count, dtype: int64
컬럼명 :  petal_width
petal_width
0.2    29
1.3    13
1.8    12
1.5    12
1.4     8
2.3     8
1.0     7
0.4     7
0.3     7
2.1     6
2.0     6
0.1     5
1.2     5
1.9     5
1.6     4
2.5     3
2.2     3
2.4     3
1.1     3
1.7     2
0.6     1
0.5     1
Name: count, dtype: int64
컬럼명 :  species
species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

 # Number of distinct values per column; dropna=False keeps NaN counted as a
 # value, matching len(Series.unique()).
 [iris[col].nunique(dropna=False) for col in iris.columns]

[35, 23, 43, 22, 3]

# Distinct values of every column, in column order (one array per column).
[col_values.unique() for _, col_values in iris.items()]

[array([5.1, 4.9, 4.7, 4.6, 5. , 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5,
        4.5, 5.3, 7. , 6.4, 6.9, 6.5, 6.3, 6.6, 5.9, 6. , 6.1, 5.6, 6.7,
        6.2, 6.8, 7.1, 7.6, 7.3, 7.2, 7.7, 7.4, 7.9]),
 array([3.5, 3. , 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, 4. , 4.4, 3.8, 3.3,
        4.1, 4.2, 2.3, 2.8, 2.4, 2.7, 2. , 2.2, 2.5, 2.6]),
 array([1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1. , 1.9, 4.7, 4.5, 4.9, 4. ,
        4.6, 3.3, 3.9, 3.5, 4.2, 3.6, 4.4, 4.1, 4.8, 4.3, 5. , 3.8, 3.7,
        5.1, 3. , 6. , 5.9, 5.6, 5.8, 6.6, 6.3, 6.1, 5.3, 5.5, 6.7, 6.9,
        5.7, 6.4, 5.4, 5.2]),
 array([0.2, 0.4, 0.3, 0.1, 0.5, 0.6, 1.4, 1.5, 1.3, 1.6, 1. , 1.1, 1.8,
        1.2, 1.7, 2.5, 1.9, 2.1, 2.2, 2. , 2.4, 2.3]),
 array(['setosa', 'versicolor', 'virginica'], dtype=object)]

# Re-check the column labels before looking at the species column.
print(iris.columns)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

# Distinct flower species present in the dataset.
iris.species.unique()

array(['setosa', 'versicolor', 'virginica'], dtype=object)

# Re-check the column labels before counting species frequencies.
print(iris.columns)

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

# How many rows belong to each species.
iris.species.value_counts()

species
setosa        50
versicolor    50
virginica     50
Name: count, dtype: int64

# Column names containing 'sepal' (iterating a DataFrame yields its labels).
[name for name in iris if 'sepal' in name]

['sepal_length', 'sepal_width']

# Column names that do not contain 'species'.
list(filter(lambda name: 'species' not in name, iris.columns))

['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# Label-based selection: rows 0 through 3 (a loc slice includes its end
# label) restricted to the columns whose name contains 'sepal'.
iris.loc[:3, [x for x in iris.columns if 'sepal' in x] ]

# iris[boolean mask] keeps only the rows for which the mask is True;
# here, the rows whose species is 'versicolor'.
iris[iris['species'].eq('versicolor')]

# Same row filter expressed through loc with a boolean mask.
iris.loc[iris.species == 'versicolor']

# Row filter through loc with an explicit ':' to keep every column.
iris.loc[iris['species'].isin(['versicolor']), :]

# Mean sepal length over all rows.
iris['sepal_length'].mean()

5.843333333333334

# iris[(condition 1) & (condition 2)]: versicolor rows whose sepal_length is
# above the mean over all rows. The mean is computed from the data rather
# than hard-coded as a truncated literal, so the filter stays correct if the
# data changes. Each condition needs its own parentheses because & binds
# tighter than the comparison operators.
iris_tmp = iris[(iris['species'] == 'versicolor') &
                (iris['sepal_length'] > iris['sepal_length'].mean())]
iris_tmp

# Extract the versicolor rows, then renumber them from 0; drop=True discards
# the old index instead of keeping it as a new column.
iris_versi = iris.loc[iris['species'].eq('versicolor')].reset_index(drop=True)
iris_versi

# Renumber the filtered rows from 0 (the old index is discarded), then
# report (rows, columns) and preview the first rows.
iris_tmp = iris_tmp.reset_index(drop=True)
print( iris_tmp.shape )
iris_tmp.head()

(26, 5)

# Pairwise correlation of the four measurement columns, i.e. every column
# except the species label.
corr_iris = iris.drop(columns='species').corr()
corr_iris

# Draw the correlation matrix as a heatmap, writing each coefficient in its
# cell with two decimal places.
sns.heatmap(corr_iris, annot=True, fmt=".2f")

<Axes: >

# Pairwise plots of the columns, with the points coloured by species.
sns.pairplot(iris, hue='species')

C:\Users\daniel_wj\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\daniel_wj\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\daniel_wj\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
C:\Users\daniel_wj\anaconda3\Lib\site-packages\seaborn\_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):

<seaborn.axisgrid.PairGrid at 0x2684d822710>

# Preview the first rows of the DataFrame.
iris.head()

# Rows ordered by sepal_length, largest first; returns a new DataFrame and
# leaves iris itself unchanged.
iris.sort_values('sepal_length', ascending=False)

# Order by sepal_length, breaking ties with sepal_width; both descending.
sort_keys = ['sepal_length', 'sepal_width']
iris.sort_values(by=sort_keys, ascending=[False] * len(sort_keys))

# Print a summary of the index, the columns with their non-null counts and
# dtypes, and the memory usage.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB

# Convert the species column to the category dtype in place, then print the
# summary again to confirm the dtype change.
iris['species'] = iris['species'].astype('category')
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   sepal_length  150 non-null    float64 
 1   sepal_width   150 non-null    float64 
 2   petal_length  150 non-null    float64 
 3   petal_width   150 non-null    float64 
 4   species       150 non-null    category
dtypes: category(1), float64(4)
memory usage: 5.1 KB

# Work on a copy so that the edits below do not modify iris itself.
iris_new = iris.copy()
iris_new

# Blank out rows 2 and 3 of the column at position 2 (petal_length) to
# create missing values for the fill exercise.
iris_new.iloc[ 2:4, 2:3] = np.nan
iris_new

# Count the missing values in each column.
iris_new.isna().sum()

sepal_length    0
sepal_width     0
petal_length    2
petal_width     0
species         0
dtype: int64

# Fill the missing petal_length values with the column mean, then confirm
# that no missing values remain.
mean_val = iris_new['petal_length'].mean()
iris_new['petal_length'] = iris_new['petal_length'].fillna(mean_val)
iris_new.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

	sepal_length	sepal_width	petal_length	petal_width
sepal_length	1.000000	-0.117570	0.871754	0.817941
sepal_width	-0.117570	1.000000	-0.428440	-0.366126
petal_length	0.871754	-0.428440	1.000000	0.962865
petal_width	0.817941	-0.366126	0.962865	1.000000

Pandas 라이브러리 IRIS 데이터셋 실습해 보기

학습 내용¶

목차

01. 데이터 준비

02. 행,열 선택

실습¶

실습해 보기 1¶

03. 중복값을 제외한 값 확인 - [ ].unique()

iris의 꽃의 종류 - 중복 제외하고 어떤 값이 있는지 확인할 수 있을까?¶

04. 중복값을 제외한 값의 빈도수 확인 - [ ].value_counts()

05. 조건식을 이용하여 해당 하는 컬럼 가져오기

실습해보기 - species를 제외한 컬럼을 출력하기¶

조건을 두고 versicolor 행만 추출해 보자.¶

두 개의 조건 - versicolor 중에 sepal_length가 평균보다 큰 것들만 추출해 보기.

실습해 보기 2¶

06. reset_index로 인덱스를 초기화시키기

(실습) iris_tmp의 행 index를 초기화시키고, 총 몇 행인지 확인해 보자.

네개의 특성에 대한 상관계수 구해보기¶

히트맵¶

실습해 보기 3¶

07. sort_values()를 이용한 정렬

sepal_length로 정렬하기¶

08. astype를 이용한 데이터 자료형 변환

object를 category로 변경하기¶

09. 결측치 채우기 - fillna()

결측값을 평균값으로 채우기¶

실습해 보기 4¶

	sepal_length	sepal_width	petal_length	petal_width	species
0	5.1	3.5	1.4	0.2	setosa
1	4.9	3.0	1.4	0.2	setosa
2	4.7	3.2	1.3	0.2	setosa
3	4.6	3.1	1.5	0.2	setosa
4	5.0	3.6	1.4	0.2	setosa
...	...	...	...	...	...
145	6.7	3.0	5.2	2.3	virginica
146	6.3	2.5	5.0	1.9	virginica
147	6.5	3.0	5.2	2.0	virginica
148	6.2	3.4	5.4	2.3	virginica
149	5.9	3.0	5.1	1.8	virginica

	sepal_length	petal_length	petal_width
0	5.1	1.4	0.2
1	4.9	1.4	0.2
2	4.7	1.3	0.2
3	4.6	1.5	0.2
4	5.0	1.4	0.2
...	...	...	...
145	6.7	5.2	2.3
146	6.3	5.0	1.9
147	6.5	5.2	2.0
148	6.2	5.4	2.3
149	5.9	5.1	1.8

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8

	sepal_length	sepal_width	petal_length	petal_width	species
50	7.0	3.2	4.7	1.4	versicolor
51	6.4	3.2	4.5	1.5	versicolor
52	6.9	3.1	4.9	1.5	versicolor
53	5.5	2.3	4.0	1.3	versicolor
54	6.5	2.8	4.6	1.5	versicolor
55	5.7	2.8	4.5	1.3	versicolor
56	6.3	3.3	4.7	1.6	versicolor
57	4.9	2.4	3.3	1.0	versicolor
58	6.6	2.9	4.6	1.3	versicolor
59	5.2	2.7	3.9	1.4	versicolor
60	5.0	2.0	3.5	1.0	versicolor
61	5.9	3.0	4.2	1.5	versicolor
62	6.0	2.2	4.0	1.0	versicolor
63	6.1	2.9	4.7	1.4	versicolor
64	5.6	2.9	3.6	1.3	versicolor
65	6.7	3.1	4.4	1.4	versicolor
66	5.6	3.0	4.5	1.5	versicolor
67	5.8	2.7	4.1	1.0	versicolor
68	6.2	2.2	4.5	1.5	versicolor
69	5.6	2.5	3.9	1.1	versicolor
70	5.9	3.2	4.8	1.8	versicolor
71	6.1	2.8	4.0	1.3	versicolor
72	6.3	2.5	4.9	1.5	versicolor
73	6.1	2.8	4.7	1.2	versicolor
74	6.4	2.9	4.3	1.3	versicolor
75	6.6	3.0	4.4	1.4	versicolor
76	6.8	2.8	4.8	1.4	versicolor
77	6.7	3.0	5.0	1.7	versicolor
78	6.0	2.9	4.5	1.5	versicolor
79	5.7	2.6	3.5	1.0	versicolor
80	5.5	2.4	3.8	1.1	versicolor
81	5.5	2.4	3.7	1.0	versicolor
82	5.8	2.7	3.9	1.2	versicolor
83	6.0	2.7	5.1	1.6	versicolor
84	5.4	3.0	4.5	1.5	versicolor
85	6.0	3.4	4.5	1.6	versicolor
86	6.7	3.1	4.7	1.5	versicolor
87	6.3	2.3	4.4	1.3	versicolor
88	5.6	3.0	4.1	1.3	versicolor
89	5.5	2.5	4.0	1.3	versicolor
90	5.5	2.6	4.4	1.2	versicolor
91	6.1	3.0	4.6	1.4	versicolor
92	5.8	2.6	4.0	1.2	versicolor
93	5.0	2.3	3.3	1.0	versicolor
94	5.6	2.7	4.2	1.3	versicolor
95	5.7	3.0	4.2	1.2	versicolor
96	5.7	2.9	4.2	1.3	versicolor
97	6.2	2.9	4.3	1.3	versicolor
98	5.1	2.5	3.0	1.1	versicolor
99	5.7	2.8	4.1	1.3	versicolor

	sepal_length	petal_length	petal_width
0	5.1	1.4	0.2
1	4.9	1.4	0.2
2	4.7	1.3	0.2
3	4.6	1.5	0.2
4	5.0	1.4	0.2
...	...	...	...
145	6.7	5.2	2.3
146	6.3	5.0	1.9
147	6.5	5.2	2.0
148	6.2	5.4	2.3
149	5.9	5.1	1.8

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8

	sepal_length	petal_length	petal_width
0	5.1	1.4	0.2
1	4.9	1.4	0.2
2	4.7	1.3	0.2
3	4.6	1.5	0.2
4	5.0	1.4	0.2
...	...	...	...
145	6.7	5.2	2.3
146	6.3	5.0	1.9
147	6.5	5.2	2.0
148	6.2	5.4	2.3
149	5.9	5.1	1.8

	sepal_length	sepal_width	petal_length	petal_width
0	5.1	3.5	1.4	0.2
1	4.9	3.0	1.4	0.2
2	4.7	3.2	1.3	0.2
3	4.6	3.1	1.5	0.2
4	5.0	3.6	1.4	0.2
...	...	...	...	...
145	6.7	3.0	5.2	2.3
146	6.3	2.5	5.0	1.9
147	6.5	3.0	5.2	2.0
148	6.2	3.4	5.4	2.3
149	5.9	3.0	5.1	1.8