728x90
In [24]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from scipy import stats
from sklearn.datasets import fetch_california_housing
%matplotlib inline
# California housing data (this cell calls fetch_california_housing, not a Boston loader)
dataset = fetch_california_housing()
In [26]:
# Build the feature frame and append the regression target as a final
# 'target' column in a single expression.
df = pd.DataFrame(
    dataset.data,
    columns=dataset.feature_names,
).assign(target=dataset.target)
In [29]:
# Report the (rows, columns) shape of the assembled frame.
print('california_housing 데이터 세트 크기 : ', df.shape)
california_housing 데이터 세트 크기 : (20640, 9)
In [31]:
# Preview the first rows to sanity-check the column names and value ranges.
df.head()
Out[31]:
MedInc | HouseAge | AveRooms | AveBedrms | Population | AveOccup | Latitude | Longitude | target | |
---|---|---|---|---|---|---|---|---|---|
0 | 8.3252 | 41.0 | 6.984127 | 1.023810 | 322.0 | 2.555556 | 37.88 | -122.23 | 4.526 |
1 | 8.3014 | 21.0 | 6.238137 | 0.971880 | 2401.0 | 2.109842 | 37.86 | -122.22 | 3.585 |
2 | 7.2574 | 52.0 | 8.288136 | 1.073446 | 496.0 | 2.802260 | 37.85 | -122.24 | 3.521 |
3 | 5.6431 | 52.0 | 5.817352 | 1.073059 | 558.0 | 2.547945 | 37.85 | -122.25 | 3.413 |
4 | 3.8462 | 52.0 | 6.281853 | 1.081081 | 565.0 | 2.181467 | 37.85 | -122.25 | 3.422 |
In [32]:
# Summarize dtypes and non-null counts per column.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 MedInc 20640 non-null float64
1 HouseAge 20640 non-null float64
2 AveRooms 20640 non-null float64
3 AveBedrms 20640 non-null float64
4 Population 20640 non-null float64
5 AveOccup 20640 non-null float64
6 Latitude 20640 non-null float64
7 Longitude 20640 non-null float64
8 target 20640 non-null float64
dtypes: float64(9)
memory usage: 1.4 MB
In [42]:
# seaborn's regplot() draws a scatter plot with a fitted linear-regression line.
# Lay the eight feature-vs-target plots out on a 2-row x 4-column grid.
fig, axs = plt.subplots(figsize=(16, 8), ncols=4, nrows=2)  # axs is a 2x4 array of Axes
# Iterating a DataFrame yields its column labels, so `feature` is a column name.
lm_features = df.select_dtypes(include='float64').iloc[:, :8]
for i, feature in enumerate(lm_features):
    # Map the flat index onto the grid: quotient -> row, remainder -> column.
    row, col = divmod(i, 4)
    sns.regplot(x=feature, y='target', data=df, ax=axs[row][col])
LinearRegression
In [43]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Separate the regression target from the feature columns.
X_data = df.drop(columns=['target'])
y_target = df['target']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_target,
    test_size=0.3,
    random_state=42,
)

# Fit on the training split (fit() returns the estimator itself).
lr = LinearRegression().fit(X_train, y_train)

# Predict on the held-out split and derive the error metrics.
y_preds = lr.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)

# Report MSE / RMSE and the R^2 score of the predictions.
print('MSE : {0:.3f}, RMSE : {1:.3f}'.format(mse, rmse))
print('Variance score : {0:.3f}'.format(r2_score(y_test, y_preds)))
MSE : 0.531, RMSE : 0.728
Variance score : 0.596
In [45]:
# Intercept: lr.intercept_
# Regression coefficients: lr.coef_ (one value per feature column)
print('절편 : ', lr.intercept_)
print('회귀 계수 : ', lr.coef_)
절편 : -37.05624133152537
회귀 계수 : [ 4.45822565e-01 9.68186799e-03 -1.22095112e-01 7.78599557e-01
-7.75740400e-07 -3.37002667e-03 -4.18536747e-01 -4.33687976e-01]
In [46]:
# Coefficient per feature, rounded to one decimal place and labelled by column name.
coef_sr = pd.Series(lr.coef_, index=X_data.columns).round(1)
# Show the coefficients from largest to smallest.
coef_sr.sort_values(ascending=False)
Out[46]:
AveBedrms 0.8
MedInc 0.4
HouseAge 0.0
Population -0.0
AveOccup -0.0
AveRooms -0.1
Latitude -0.4
Longitude -0.4
dtype: float64
교차 검증
In [47]:
from sklearn.model_selection import cross_val_score
# Rebuild the feature matrix and target so this cell does not rely on the split above.
X_data = df.drop(columns=['target'])
y_target = df['target']
lr = LinearRegression()

# Score with 5-fold cross-validation using negated MSE, then convert each fold to RMSE.
neg_mse_scores = cross_val_score(
    lr,
    X_data,
    y_target,
    scoring="neg_mean_squared_error",
    cv=5,
)
# The scorer returns negative values, so flip the sign before taking the square root.
rmse_scores = np.sqrt(-neg_mse_scores)
avg_rmse = rmse_scores.mean()

print('5 folds의 개별 Negative MSE scores :', np.round(neg_mse_scores, 2))
print('5 folds의 개별 RMSE scores :', np.round(rmse_scores, 2))
print('5 folds의 평균 RMSE : {0:.3f}'.format(avg_rmse))
5 folds의 개별 Negative MSE scores : [-0.48 -0.62 -0.65 -0.54 -0.49]
5 folds의 개별 RMSE scores : [0.7 0.79 0.8 0.74 0.7 ]
5 folds의 평균 RMSE : 0.746
728x90
'Data Analytics with python > [Machine Learning ]' 카테고리의 다른 글
[연관 규칙 데이터 정제] 데이터를 정제하여 apriori 알고리즘을 수행하기 위한 준비 (0) | 2023.02.03 |
---|---|
[연관 규칙 분석] Association_rules 분석 (0) | 2023.02.03 |
[학습 01] 주택 가격 예측하기 (1) | 2023.02.01 |
[이미지 분할] Image segmentation (1) | 2023.01.26 |
[분류 모델 평가 지표] Confusion Matrix (0) | 2023.01.24 |
댓글