728x90

In [24]:

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.datasets import fetch_california_housing
%matplotlib inline

# California housing data (loaded via scikit-learn's fetch_california_housing)
dataset = fetch_california_housing()

In [26]:

# Build one frame holding the feature columns, then append the regression
# target as a final column named 'target'.
df = pd.DataFrame(data=dataset.data, columns=dataset.feature_names).assign(
    target=dataset.target
)

In [29]:

# Report the (rows, columns) shape of the assembled frame.
print('california_housing 데이터 세트 크기 : ', df.shape)

california_housing 데이터 세트 크기 :  (20640, 9)

In [31]:

# Preview the first rows to sanity-check the feature columns and the target.
df.head()

Out[31]:

	MedInc	HouseAge	AveRooms	AveBedrms	Population	AveOccup	Latitude	Longitude	target
0	8.3252	41.0	6.984127	1.023810	322.0	2.555556	37.88	-122.23	4.526
1	8.3014	21.0	6.238137	0.971880	2401.0	2.109842	37.86	-122.22	3.585
2	7.2574	52.0	8.288136	1.073446	496.0	2.802260	37.85	-122.24	3.521
3	5.6431	52.0	5.817352	1.073059	558.0	2.547945	37.85	-122.25	3.413
4	3.8462	52.0	6.281853	1.081081	565.0	2.181467	37.85	-122.25	3.422

In [32]:

# Summarize column dtypes and non-null counts to check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   MedInc      20640 non-null  float64
 1   HouseAge    20640 non-null  float64
 2   AveRooms    20640 non-null  float64
 3   AveBedrms   20640 non-null  float64
 4   Population  20640 non-null  float64
 5   AveOccup    20640 non-null  float64
 6   Latitude    20640 non-null  float64
 7   Longitude   20640 non-null  float64
 8   target      20640 non-null  float64
dtypes: float64(9)
memory usage: 1.4 MB

In [42]:

# seaborn's regplot() draws a scatter plot with a fitted linear-regression line.
# A 2 x 4 grid provides one axes per feature.
fig, axs = plt.subplots(nrows=2, ncols=4, figsize=(16, 8))
lm_features = df.select_dtypes(include='float64').iloc[:, :8]
# axs.flat walks the grid row by row, so the k-th feature is drawn at
# position (k // 4, k % 4) of the grid.
for ax, feature in zip(axs.flat, lm_features.columns):
    sns.regplot(x=feature, y='target', data=df, ax=ax)

LinearRegression

In [43]:

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate the regression target from the feature matrix.
y_target = df['target']
X_data = df.drop(columns=['target'])

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_target, test_size=0.3, random_state=42
)

# Fit a linear regression on the training split (fit() returns the estimator).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and derive MSE / RMSE from the predictions.
y_preds = lr.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

# Report the error metrics and the R^2 score of the held-out predictions.
print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(mse, rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_test, y_preds)))

MSE : 0.531, RMSE : 0.728
Variance score : 0.596

In [45]:

# Intercept of the fitted model: lr.intercept_
# Regression coefficients, one per feature column: lr.coef_
print('절편 : ', lr.intercept_)
print('회귀 계수 : ', lr.coef_)

절편 :  -37.05624133152537
회귀 계수 :  [ 4.45822565e-01  9.68186799e-03 -1.22095112e-01  7.78599557e-01
 -7.75740400e-07 -3.37002667e-03 -4.18536747e-01 -4.33687976e-01]

In [46]:

# Regression coefficient per feature, largest first.
# Keep the unrounded values for ordering: rounding to one decimal first turns
# several small coefficients into 0.0 / -0.0, and the order among such ties
# would then depend on the sort algorithm rather than on the real values.
coef_raw = pd.Series(data=lr.coef_, index=X_data.columns)
# coef_sr keeps its original meaning: coefficients rounded to one decimal,
# in feature-column order.
coef_sr = coef_raw.round(1)
# Display: order by the exact coefficients, round only for presentation.
coef_raw.sort_values(ascending=False).round(1)

Out[46]:

AveBedrms     0.8
MedInc        0.4
HouseAge      0.0
Population   -0.0
AveOccup     -0.0
AveRooms     -0.1
Latitude     -0.4
Longitude    -0.4
dtype: float64

교차 검증

In [47]:

from sklearn.model_selection import cross_val_score

# Rebuild the target / feature split and start from an unfitted estimator.
y_target = df['target']
X_data = df.drop(columns=['target'])
lr = LinearRegression()

# 5-fold cross-validation scored with "neg_mean_squared_error"; the scores are
# negated MSE values, so flip the sign before taking the square root for RMSE.
neg_mse_scores = cross_val_score(
    lr, X_data, y_target, scoring="neg_mean_squared_error", cv=5
)
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

# The per-fold scores printed first are the raw (negative) MSE values.
print('5 folds의 개별 Negative MSE scores :', np.round(neg_mse_scores, 2))
print('5 folds의 개별 RMSE scores :', np.round(rmse_scores, 2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))

5 folds의 개별 Negative MSE scores : [-0.48 -0.62 -0.65 -0.54 -0.49]
5 folds의 개별 RMSE scores : [0.7  0.79 0.8  0.74 0.7 ]
5 folds의 평균 RMSE : 0.746

728x90

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

[연관 규칙 데이터 정제] 데이터를 정제하여 apriori 알고리즘을 수행하기 위한 준비 (0)	2023.02.03
[연관 규칙 분석] Association_rules 분석 (0)	2023.02.03
[학습 01] 주택 가격 예측하기 (1)	2023.02.01
[이미지 분할] Image segmentation (1)	2023.01.26
[분류 모델 평가 지표] Confusion Matrix (0)	2023.01.24

Kang's Note

[회귀 구현] data : california_housing

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

댓글

티스토리툴바

[회귀 구현] data : california_housing

'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글

관련글

댓글

티스토리툴바