(Machine Learning) Analyzing Kaggle's Santander Customer Satisfaction Dataset
Santander Bank Customer Satisfaction Analysis
1. A TARGET value of 1 indicates a customer who was dissatisfied with the service.
2. A TARGET value of 0 indicates a customer who was satisfied with the service.
3. I will use ROC-AUC as the evaluation metric, because the dissatisfied class is much smaller than the satisfied class (see the sketch below).
4. XGBoost and LightGBM will be used.
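To see why plain accuracy would be a poor metric here, consider a minimal sketch with made-up toy labels (not the actual dataset): a "model" that labels everyone satisfied reaches 96% accuracy but only 0.5 ROC-AUC.

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

# Toy labels mimicking the roughly 96/4 class split; NOT the real dataset
y_true = np.array([0] * 96 + [1] * 4)
always_zero = np.zeros(100)  # always predict "satisfied"

print(accuracy_score(y_true, always_zero))  # 0.96 despite learning nothing
print(roc_auc_score(y_true, always_zero))   # 0.5, i.e., no better than chance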
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
cust_df = pd.read_csv('train_santander.csv', encoding='latin-1')
print('dataset shape:', cust_df.shape)
cust_df.head(3)
dataset shape: (76020, 371)
   ID  var3  var15  imp_ent_var16_ult1  ...     var38  TARGET
0   1     2     23                 0.0  ...  39205.17       0
1   3     2     34                 0.0  ...  49278.03       0
2   4     2     23                 0.0  ...  67333.77       0

3 rows × 371 columns
cust_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB
print(cust_df['TARGET'].value_counts())
unsatisfied_cnt = cust_df[cust_df['TARGET'] == 1].TARGET.count()
total_cnt = cust_df.TARGET.count()
print('unsatisfied ratio is {0:.2f}'.format(unsatisfied_cnt / total_cnt))
0    73012
1     3008
Name: TARGET, dtype: int64
unsatisfied ratio is 0.04
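As a quick aside, the same ratio can be read off directly with pandas' normalize option, equivalent to the manual division above:

# Equivalent one-liner: normalized class frequencies
print(cust_df['TARGET'].value_counts(normalize=True))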
cust_df.describe()
                  ID           var3         var15  ...         var38        TARGET
count   76020.000000   76020.000000  76020.000000  ...  7.602000e+04  76020.000000
mean    75964.050723   -1523.199277     33.212865  ...  1.172358e+05      0.039569
std     43781.947379   39033.462364     12.956486  ...  1.826646e+05      0.194945
min         1.000000 -999999.000000      5.000000  ...  5.163750e+03      0.000000
25%     38104.750000       2.000000     23.000000  ...  6.787061e+04      0.000000
50%     76043.000000       2.000000     28.000000  ...  1.064092e+05      0.000000
75%    113748.750000       2.000000     40.000000  ...  1.187563e+05      0.000000
max    151838.000000     238.000000    105.000000  ...  2.203474e+07      1.000000

8 rows × 371 columns
# var3's minimum of -999999 looks like a NaN placeholder; replace it with 2, the most frequent value
cust_df['var3'].replace(-999999, 2, inplace=True)
cust_df.drop('ID', axis=1, inplace=True)
# Separate the feature set and the label set; the label column sits at the very
# end of the DataFrame, so split at column position -1
X_features = cust_df.iloc[:, :-1]
y_labels = cust_df.iloc[:, -1]
print(X_features.shape)
(76020, 369)
Since this is an imbalanced dataset, check whether the distribution of the TARGET class came out similar in both the training and the test set!
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print(y_train.shape)
print(X_train.shape, X_test.shape)
print(y_train.value_counts())
print(train_cnt)
print(y_train.value_counts() / train_cnt)
print(y_test.value_counts() / test_cnt)
(60816,)
(60816, 369) (15204, 369)
0    58442
1     2374
Name: TARGET, dtype: int64
60816
0    0.960964
1    0.039036
Name: TARGET, dtype: float64
0    0.9583
1    0.0417
Name: TARGET, dtype: float64
Looking at the output above, y_train.count() returns the number of rows in the training set (80% of all rows), while y_train.value_counts() counts the 0 and 1 labels separately.
Dividing those counts by the total gives the class ratios: the train and test sets are similar, each with roughly 4% of the dissatisfied label '1'.
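Here the plain random split already happens to preserve the class ratio, but if it had not, passing stratify to train_test_split would force the 0/1 ratio to match exactly in both sets. A sketch (not part of the original analysis):

# Alternative split (not used above): stratified sampling keeps the class
# ratio identical in the train and test sets
X_tr, X_te, y_tr, y_te = train_test_split(X_features, y_labels, test_size=0.2,
                                          random_state=0, stratify=y_labels)
print(y_tr.value_counts(normalize=True))
print(y_te.value_counts(normalize=True))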
xgb_clf = XGBClassifier(n_estimators=500, random_state=156)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
            eval_set=[(X_train, y_train), (X_test, y_test)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
Since hyperparameter tuning still has to be done, it is fine to first cut n_estimators to 100 and early_stopping_rounds to 30 for a quick test, and raise them again later. Watch how the ROC-AUC score changes as the hyperparameters change.
xgb_clf = XGBClassifier(n_estimators=100)
params = {'max_depth': [5, 7], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75]}
# cv is left unspecified to speed up the hyperparameter search
gridcv = GridSearchCV(xgb_clf, param_grid=params)
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
           eval_set=[(X_train, y_train), (X_test, y_test)])
print('GridSearchCV best parameters:', gridcv.best_params_)
xgb_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
Now add more hyperparameters and optimize further!
# n_estimators: the number of decision trees (weak learners)
# learning_rate: the rate applied at each boosting step, tunable between 0 and 1;
#                smaller values train longer because the weak learners iterate more
#                times, and it works in tandem with n_estimators
# min_child_weight: similar to min_samples_leaf (not identical)
# colsample_bytree: similar to max_features; randomly samples the features used to build each tree
xgb_clf = XGBClassifier(n_estimators=100, random_state=156, learning_rate=0.02,
                        max_depth=5, min_child_weight=1, colsample_bytree=0.75,
                        reg_alpha=0.03)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=200, eval_metric="auc",
            eval_set=[(X_train, y_train), (X_test, y_test)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
plot_importance(xgb_clf, ax=ax, max_num_features=20, height=0.4)
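For reference, the same top-20 ranking can be pulled out as data rather than a plot, using the fitted model's feature_importances_ attribute; a small sketch, not part of the original post:

# Top 20 features by importance as a pandas Series instead of a plot
feat_imp = pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
print(feat_imp.sort_values(ascending=False).head(20))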
lgbm_clf = LGBMClassifier(n_estimators=500)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
             eval_set=evals, verbose=True)
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print(lgbm_roc_score)
# n_estimators is reduced to 200 to speed up the hyperparameter search
lgbm_clf = LGBMClassifier(n_estimators=200)
params = {'num_leaves': [32, 64], 'max_depth': [128, 160],
          'min_child_samples': [60, 100], 'subsample': [0.8, 1]}
# cv is left unspecified to speed up the hyperparameter search
gridcv = GridSearchCV(lgbm_clf, param_grid=params)
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
           eval_set=[(X_train, y_train), (X_test, y_test)])
print('GridSearchCV best parameters:', gridcv.best_params_)
lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, 1], average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=32, subsample=0.8,
                          min_child_samples=100, max_depth=128)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
             eval_set=evals, verbose=True)
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))