728x90
작업형 예시 / 문제 : Classification / 평가 : auc
In [60]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier as RFC
from xgboost import XGBClassifier as XGB
from lightgbm import LGBMClassifier as LGB
from sklearn.metrics import roc_curve, auc
1) selection
In [61]:
# Load the training table (features plus the 'target' column) and the test
# feature table from CSV.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs elsewhere.
train_csv_path = "C:/Users/KANG/PYTHON/kaggle/01_aug_train.csv"
test_csv_path = "C:/Users/KANG/PYTHON/kaggle/01_aug_test.csv"
train = pd.read_csv(train_csv_path)
X_test = pd.read_csv(test_csv_path)
In [62]:
# Separate the label column from the predictors. The feature frame is an
# independent copy, so later edits to X_train never touch `train`.
TARGET_COLUMN = 'target'
y_train = train[TARGET_COLUMN]
X_train = train.drop(columns=[TARGET_COLUMN]).copy()
In [63]:
# print(X_train.head())
# print(X_test.head())
# print(y_train.head())
# print(X_train.info())
# print(X_test.info())
# print(y_train.info())
# print(X_train.describe())
# print(X_test.describe())
# print(y_train.describe())
2) preprocessing
In [64]:
# 1. key variable
# Keep a copy of the test ids (they are written to the submission file later),
# then remove the id column from both feature tables so it is not a predictor.
ID_COLUMN = 'enrollee_id'
enrollee_id = X_test[ID_COLUMN].copy()
X_train = X_train.drop(columns=[ID_COLUMN])
X_test = X_test.drop(columns=[ID_COLUMN])
In [65]:
# 2. missing values
# Count missing values per column in both tables. A notebook cell renders only
# its final expression, so just the test-set counts are displayed.
train_na_counts = X_train.isna().sum()
test_na_counts = X_test.isna().sum()
test_na_counts
Out[65]:
city 0 city_development_index 0 gender 508 relevent_experience 0 enrolled_university 31 education_level 52 major_discipline 312 experience 5 company_size 622 company_type 634 last_new_job 40 training_hours 0 dtype: int64
In [66]:
# Drop every column that has more than 1000 missing values in the training
# table (original note: columns missing in roughly 10% or more of the rows).
# The column list is decided on the training table only and then applied to
# both tables so they keep the same schema.
na_per_column = X_train.isna().sum()
cond_na1000 = na_per_column.gt(1000)
colum_na1000 = X_train.columns[cond_na1000]
X_train = X_train.drop(columns=colum_na1000)
X_test = X_test.drop(columns=colum_na1000)
In [67]:
# Replacing values
# Missing values of a categorical variable are replaced with its most frequent
# value. The fill value is taken from the training table and reused for the
# test table.
# enrolled_university
eu_column = 'enrolled_university'
mode_EU = X_train[eu_column].value_counts().idxmax()
for frame in (X_train, X_test):
    frame[eu_column] = frame[eu_column].fillna(mode_EU)
In [68]:
# education_level
# Fill missing values with the most frequent training-table value; the same
# fill value is applied to the test table.
el_column = 'education_level'
mode_EL = X_train[el_column].value_counts().idxmax()
for frame in (X_train, X_test):
    frame[el_column] = frame[el_column].fillna(mode_EL)
In [69]:
# 3. category variable : N/A
# 4. numeric variable : N/A
# NOTE(review): the test-set missing-value counts displayed earlier also list
# 'experience' and 'last_new_job'; no visible step imputes them - confirm the
# downstream encoder is expected to receive NaN in those columns.

# Hold out 20% of the labelled rows as a validation set (fixed seed).
# This cell rebinds X_train / y_train to the reduced training part, so running
# it a second time would split the already-split data again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=1234,
)
In [70]:
# Some 'city' categories are present only in the training data, so the
# categorical levels of the tables do not match exactly.
# When one-hot encoding, use handle_unknown='ignore'.
# Only the final expression (the test-table counts) is rendered by the cell.
train_category_combos = X_train.select_dtypes(include='object').value_counts()
test_category_combos = X_test.select_dtypes(include='object').value_counts()
test_category_combos
Out[70]:
city relevent_experience enrolled_university education_level experience last_new_job city_103 Has relevent experience no_enrollment Graduate >20 >4 40 1 19 Masters >20 >4 19 city_114 Has relevent experience no_enrollment Masters >20 >4 14 city_103 Has relevent experience no_enrollment Graduate >20 2 12 .. city_134 No relevent experience no_enrollment Phd 6 never 1 High School 4 never 1 Graduate 6 1 1 2 never 1 city_99 No relevent experience no_enrollment Graduate >20 >4 1 Length: 1576, dtype: int64
3) transformation
In [71]:
# encoding
# One-hot encode the object-dtype columns. The encoder is fitted on the
# training split only and then applied unchanged to validation and test;
# handle_unknown='ignore' encodes categories not seen during fit as all zeros
# instead of raising.
X_train_cat = X_train.select_dtypes('object').copy()
X_val_cat = X_val.select_dtypes('object').copy()
X_test_cat = X_test.select_dtypes('object').copy()

# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2
# and removed in 1.4. Keeping the encoder's default sparse output and
# densifying explicitly gives the same dense arrays on every version.
enc = OneHotEncoder(handle_unknown='ignore').fit(X_train_cat)
X_TRAIN_OH = enc.transform(X_train_cat).toarray()
X_VAL_OH = enc.transform(X_val_cat).toarray()
X_TEST_OH = enc.transform(X_test_cat).toarray()
In [72]:
# scale
# Standardise the non-object columns. The scaler's statistics come from the
# training split only and are reused for validation and test.
X_train_conti, X_val_conti, X_test_conti = (
    frame.select_dtypes(exclude='object').copy()
    for frame in (X_train, X_val, X_test)
)

scale = StandardScaler()
scale.fit(X_train_conti)

X_TRAIN_STD, X_VAL_STD, X_TEST_STD = (
    scale.transform(part)
    for part in (X_train_conti, X_val_conti, X_test_conti)
)
In [73]:
# Build the final model inputs: one-hot columns first, scaled numeric columns
# after them, joined column-wise.
X_TRAIN = np.hstack((X_TRAIN_OH, X_TRAIN_STD))
X_VAL = np.hstack((X_VAL_OH, X_VAL_STD))

# Labels as flat 1-D arrays.
y_TRAIN = np.ravel(y_train.to_numpy())
y_VAL = np.ravel(y_val.to_numpy())
4) modeling
In [74]:
# Silence every warning from this point on: the 'ignore' action is installed
# with no message, category or module restriction.
# NOTE(review): this also hides deprecation warnings raised by the model
# libraries; consider narrowing the filter.
import warnings
warnings.filterwarnings(action='ignore')
In [75]:
# Fit three tree-ensemble classifiers on the same training matrix so their
# validation scores can be compared afterwards. Each `fit` returns the
# estimator itself, so model_* and the estimator variable are the same object.

# Random forest: 500 shallow trees (depth 3, at least 10 samples per leaf).
rf = RFC(
    n_estimators=500,
    max_depth=3,
    min_samples_leaf=10,
    max_features='sqrt',
    random_state=2022,
)
model_rf = rf.fit(X_TRAIN, y_TRAIN)

# XGBoost, binary logistic objective.
# - `n_jobs` replaces the deprecated `nthread` alias (same thread count).
# - `eval_metric` was 'mlogloss', which is the multiclass log-loss and does not
#   match a binary objective; 'auc' matches the metric this task is scored on.
#   No eval_set is passed to fit, so the metric choice does not affect the
#   fitted trees.
# NOTE(review): `use_label_encoder` is deprecated in recent xgboost releases;
# it is kept for compatibility with the version this notebook was run on.
xgb = XGB(
    max_depth=8,
    n_estimators=500,
    n_jobs=5,
    min_child_weight=20,
    gamma=0.5,
    objective='binary:logistic',
    use_label_encoder=False,
    random_state=2022,
    eval_metric='auc',
)
model_xgb = xgb.fit(X_TRAIN, y_TRAIN)

# LightGBM, binary objective.
# NOTE(review): n_jobs=30 is a machine-specific thread count - confirm it suits
# the machine the notebook runs on.
lgb = LGB(
    max_depth=8,
    n_estimators=500,
    n_jobs=30,
    min_child_weight=10,
    learning_rate=0.2,
    objective='binary',
    random_state=2022,
)
model_lgb = lgb.fit(X_TRAIN, y_TRAIN)
5) evaluation
In [76]:
# Positive-class probabilities on the validation set for each fitted model.
score_rf = model_rf.predict_proba(X_VAL)[:, 1]
score_xgb = model_xgb.predict_proba(X_VAL)[:, 1]
score_lgb = model_lgb.predict_proba(X_VAL)[:, 1]

# Area under the ROC curve per model, printed in the order rf, xgb, lgb.
val_auc_by_model = {}
for model_key, val_scores in (('rf', score_rf), ('xgb', score_xgb), ('lgb', score_lgb)):
    fpr, tpr, thresholds = roc_curve(y_VAL, val_scores)
    val_auc_by_model[model_key] = auc(fpr, tpr)
    print(val_auc_by_model[model_key])

auc_rf = val_auc_by_model['rf']
auc_xgb = val_auc_by_model['xgb']
auc_lgb = val_auc_by_model['lgb']
0.741305427464155 0.7497823429445684 0.7379727269899397
제출
In [77]:
# Build the test matrix with the same column layout as the training matrix
# (one-hot columns first, scaled numeric columns after).
X_TEST = np.concatenate([X_TEST_OH, X_TEST_STD], axis = 1)

# The task is scored on AUC, so predict with whichever fitted model achieved
# the highest validation AUC rather than hard-coding the random forest.
validation_auc = {'rf': auc_rf, 'xgb': auc_xgb, 'lgb': auc_lgb}
fitted_models = {'rf': model_rf, 'xgb': model_xgb, 'lgb': model_lgb}
best_model_name = max(validation_auc, key=validation_auc.get)
best_model = fitted_models[best_model_name]
print(best_model_name, validation_auc[best_model_name])

# target_prob is the positive-class probability (the quantity AUC is computed
# from); target is the model's hard class prediction.
y_score = best_model.predict_proba(X_TEST)[:,1]
y_pred = best_model.predict(X_TEST)

obj = {'enrollee_id': enrollee_id,
       'target' : y_pred,
       'target_prob' : y_score}
result = pd.DataFrame(obj)
result.to_csv("12345.csv", index = False)
참고 : 실제 값과 비교하여 얼마나 잘 맞추는가
In [78]:
# actual = pd.read_csv("test.csv")
# actual = actual['target'].ravel()
# fpr, tpr, thresholds = roc_curve(actual, y_score, pos_label = 1)
# auc(fpr, tpr)
728x90
'자격증 > [빅데이터분석기사]' 카테고리의 다른 글
[실기] 빅데이터분석기사 실기 작업 흐름 (0) | 2023.01.16 |
---|---|
[실기] 빅데이터분석기사 실기 - 작업형 2 빈출 (0) | 2023.01.16 |
댓글