728x90
In [6]:
import numpy as np
import pandas as pd
Numerical data
In [11]:
# Build a 10x5 frame of random integers drawn from [0, 100), one column per
# letter A-E. No seed is set, so the values differ on every run.
random_values = np.random.randint(low=0, high=100, size=(10, 5))
num_df = pd.DataFrame(random_values, columns=["A", "B", "C", "D", "E"])
num_df
Out[11]:
A | B | C | D | E | |
---|---|---|---|---|---|
0 | 5 | 9 | 66 | 69 | 87 |
1 | 18 | 87 | 31 | 59 | 42 |
2 | 30 | 69 | 31 | 13 | 38 |
3 | 90 | 96 | 19 | 59 | 91 |
4 | 34 | 95 | 27 | 56 | 69 |
5 | 28 | 36 | 26 | 34 | 53 |
6 | 57 | 55 | 96 | 6 | 16 |
7 | 48 | 30 | 19 | 8 | 58 |
8 | 6 | 72 | 65 | 76 | 61 |
9 | 96 | 72 | 0 | 77 | 93 |
Categorical data
In [12]:
# Small frame of two string-valued (categorical) columns, four rows each.
cat_df = pd.DataFrame(
    dict(
        Color=["Red", "Blue", "Green", "Yellow"],
        Brand=["nike", "samsung", "naver", "kakao"],
    )
)
cat_df
Out[12]:
Color | Brand | |
---|---|---|
0 | Red | nike |
1 | Blue | samsung |
2 | Green | naver |
3 | Yellow | kakao |
Missing values
In [15]:
# Single-column frame with runs of missing values at the start, in the middle,
# and at the end, surrounding two runs of real numbers.
nan_data = {
    'example': (
        [np.nan] * 2
        + [10, 20, 30, 50, 100]
        + [np.nan] * 2
        + [1, 2, 3, 4, 5]
        + [np.nan] * 3
    )
}
nan_df = pd.DataFrame(nan_data, columns=['example'])
In [18]:
# True if at least one cell anywhere in the frame is missing.
nan_df.isna().to_numpy().any()
Out[18]:
True
In [19]:
# True if the 'example' column holds at least one missing value.
nan_df['example'].isna().to_numpy().any()
Out[19]:
True
In [31]:
# Replace every missing entry with 0, then preview the first five rows.
clean_df = nan_df.fillna(value=0)
clean_df.head(n=5)
Out[31]:
example | |
---|---|
0 | 0.0 |
1 | 0.0 |
2 | 10.0 |
3 | 20.0 |
4 | 30.0 |
In [25]:
# Forward-fill: each NaN takes the last valid value above it. The leading NaNs
# have no earlier value to copy, so they stay NaN.
# DataFrame.ffill() is used instead of fillna(method='ffill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
nan_df.ffill().head(5)
Out[25]:
example | |
---|---|
0 | NaN |
1 | NaN |
2 | 10.0 |
3 | 20.0 |
4 | 30.0 |
In [30]:
# Backward-fill: each NaN takes the next valid value below it, so the two
# leading NaNs become 10.0.
# DataFrame.bfill() is used instead of fillna(method='bfill') because the
# `method` argument of fillna is deprecated in recent pandas releases.
nan_df.bfill().head(5)
Out[30]:
example | |
---|---|
0 | 10.0 |
1 | 10.0 |
2 | 10.0 |
3 | 20.0 |
4 | 30.0 |
In [27]:
# Keep only the rows without a missing value; the surviving rows keep their
# original index labels.
df_drop = nan_df.dropna(axis=0, how="any")
df_drop.head(n=5)
Out[27]:
example | |
---|---|
2 | 10.0 |
3 | 20.0 |
4 | 30.0 |
5 | 50.0 |
6 | 100.0 |
In [28]:
# Summary statistics (count, mean, std, min, quartiles, max) of the rows that
# remain after dropna().
df_drop.describe()
Out[28]:
example | |
---|---|
count | 10.000000 |
mean | 22.500000 |
std | 31.433351 |
min | 1.000000 |
25% | 3.250000 |
50% | 7.500000 |
75% | 27.500000 |
max | 100.000000 |
Encoding
In [32]:
# One-hot encode every string column: one indicator column per distinct value,
# named "<column>_<value>".
# dtype is pinned explicitly because the get_dummies default changed from uint8
# to bool in pandas 2.0; without it the indicators render as True/False rather
# than 0/1.
pd.get_dummies(cat_df, dtype="uint8")
Out[32]:
Color_Blue | Color_Green | Color_Red | Color_Yellow | Brand_kakao | Brand_naver | Brand_nike | Brand_samsung | |
---|---|---|---|---|---|---|---|---|
0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 |
1 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
2 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 |
3 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 |
Converting Data Types
In [33]:
# Inspect the column dtypes before converting; 'example' is float64 here.
clean_df.dtypes
Out[33]:
example float64
dtype: object
In [34]:
# Cast every column to pandas' dedicated string dtype and confirm the result.
df_string = clean_df.astype(pd.StringDtype())
df_string.dtypes
Out[34]:
example string
dtype: object
In [35]:
# Cast every column to the default integer dtype and confirm the result.
# clean_df comes from fillna(0), so there are no NaNs to block the cast.
df_int = clean_df.astype(int)
df_int.dtypes
Out[35]:
example int64
dtype: object
728x90
'Data Analytics with python > [Data Analysis]' 카테고리의 다른 글
[Visualization] seaborn (0) | 2023.01.22 |
---|---|
[Visualization] matplotlib (0) | 2023.01.22 |
[Text]S8_08_Word_Cloud (0) | 2023.01.21 |
[Text]S8_07_Text_visualization (0) | 2023.01.21 |
[Text]S8_06_Text_tokenization (0) | 2023.01.21 |
댓글