(Machine Learning) Analyzing Kaggle's Santander Customer Satisfaction Dataset
Santander Bank Customer Satisfaction Analysis
1. A TARGET value of 1 indicates a customer who was dissatisfied with the service.
2. A TARGET value of 0 indicates a customer who was satisfied with the service.
3. I will use ROC-AUC as the evaluation metric, because the dissatisfied class is much smaller than the satisfied class (see the sketch below).
4. XGBoost and LightGBM will be used.
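To see why plain accuracy would be a poor metric here, consider a minimal sketch with made-up toy labels (not the actual dataset): a "model" that labels everyone satisfied reaches 96% accuracy but only 0.5 ROC-AUC.

import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score

# Toy labels mimicking the roughly 96/4 class split; NOT the real dataset
y_true = np.array([0] * 96 + [1] * 4)
always_zero = np.zeros(100)  # always predict "satisfied"

print(accuracy_score(y_true, always_zero))  # 0.96 despite learning nothing
print(roc_auc_score(y_true, always_zero))   # 0.5, i.e., no better than chance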
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier, plot_importance
from lightgbm import LGBMClassifier
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline
cust_df = pd.read_csv('train_santander.csv', encoding='latin-1')
print('dataset shape:', cust_df.shape)
cust_df.head(3)
dataset shape: (76020, 371)
   ID  var3  var15  imp_ent_var16_ult1  ...     var38  TARGET
0   1     2     23                 0.0  ...  39205.17       0
1   3     2     34                 0.0  ...  49278.03       0
2   4     2     23                 0.0  ...  67333.77       0

3 rows × 371 columns
cust_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB
print(cust_df['TARGET'].value_counts())
unsatisfied_cnt = cust_df[cust_df['TARGET'] == 1].TARGET.count()
total_cnt = cust_df.TARGET.count()
print('unsatisfied ratio is {0:.2f}'.format(unsatisfied_cnt / total_cnt))
0    73012
1     3008
Name: TARGET, dtype: int64
unsatisfied ratio is 0.04
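As a quick aside, the same ratio can be read off directly with pandas' normalize option, equivalent to the manual division above:

# Equivalent one-liner: normalized class frequencies
print(cust_df['TARGET'].value_counts(normalize=True))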
cust_df.describe()
                  ID           var3         var15  ...         var38        TARGET
count   76020.000000   76020.000000  76020.000000  ...  7.602000e+04  76020.000000
mean    75964.050723   -1523.199277     33.212865  ...  1.172358e+05      0.039569
std     43781.947379   39033.462364     12.956486  ...  1.826646e+05      0.194945
min         1.000000 -999999.000000      5.000000  ...  5.163750e+03      0.000000
25%     38104.750000       2.000000     23.000000  ...  6.787061e+04      0.000000
50%     76043.000000       2.000000     28.000000  ...  1.064092e+05      0.000000
75%    113748.750000       2.000000     40.000000  ...  1.187563e+05      0.000000
max    151838.000000     238.000000    105.000000  ...  2.203474e+07      1.000000

8 rows × 371 columns
# var3's minimum of -999999 looks like a NaN placeholder; replace it with 2, the most frequent value
cust_df['var3'].replace(-999999, 2, inplace=True)
cust_df.drop('ID', axis=1, inplace=True)
# Separate the feature set and the label set; the label column sits at the very
# end of the DataFrame, so split at column position -1
X_features = cust_df.iloc[:, :-1]
y_labels = cust_df.iloc[:, -1]
print(X_features.shape)
(76020, 369)
Since this is an imbalanced dataset, check whether the distribution of the TARGET class came out similar in both the training and the test set!
X_train, X_test, y_train, y_test = train_test_split(X_features, y_labels,
                                                    test_size=0.2, random_state=0)
train_cnt = y_train.count()
test_cnt = y_test.count()
print(y_train.shape)
print(X_train.shape, X_test.shape)
print(y_train.value_counts())
print(train_cnt)
print(y_train.value_counts() / train_cnt)
print(y_test.value_counts() / test_cnt)
(60816,)
(60816, 369) (15204, 369)
0    58442
1     2374
Name: TARGET, dtype: int64
60816
0    0.960964
1    0.039036
Name: TARGET, dtype: float64
0    0.9583
1    0.0417
Name: TARGET, dtype: float64
Looking at the output above, y_train.count() returns the number of rows in the training set (80% of all rows), while y_train.value_counts() counts the 0 and 1 labels separately.
Dividing those counts by the total gives the class ratios: the train and test sets are similar, each with roughly 4% of the dissatisfied label '1'.
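Here the plain random split already happens to preserve the class ratio, but if it had not, passing stratify to train_test_split would force the 0/1 ratio to match exactly in both sets. A sketch (not part of the original analysis):

# Alternative split (not used above): stratified sampling keeps the class
# ratio identical in the train and test sets
X_tr, X_te, y_tr, y_te = train_test_split(X_features, y_labels, test_size=0.2,
                                          random_state=0, stratify=y_labels)
print(y_tr.value_counts(normalize=True))
print(y_te.value_counts(normalize=True))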
xgb_clf = XGBClassifier(n_estimators=500, random_state=156)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
            eval_set=[(X_train, y_train), (X_test, y_test)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
Since hyperparameter tuning still has to be done, it is fine to first cut n_estimators to 100 and early_stopping_rounds to 30 for a quick test, and raise them again later. Watch how the ROC-AUC score changes as the hyperparameters change.
xgb_clf = XGBClassifier(n_estimators=100)
params = {'max_depth': [5, 7], 'min_child_weight': [1, 3], 'colsample_bytree': [0.5, 0.75]}
# cv is left unspecified to speed up the hyperparameter search
gridcv = GridSearchCV(xgb_clf, param_grid=params)
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
           eval_set=[(X_train, y_train), (X_test, y_test)])
print('GridSearchCV best parameters:', gridcv.best_params_)
xgb_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
Now add more hyperparameters and optimize further!
# n_estimators: the number of decision trees (weak learners)
# learning_rate: the rate applied at each boosting step, tunable between 0 and 1;
#                smaller values train longer because the weak learners iterate more
#                times, and it works in tandem with n_estimators
# min_child_weight: similar to min_samples_leaf (not identical)
# colsample_bytree: similar to max_features; randomly samples the features used to build each tree
xgb_clf = XGBClassifier(n_estimators=100, random_state=156, learning_rate=0.02,
                        max_depth=5, min_child_weight=1, colsample_bytree=0.75,
                        reg_alpha=0.03)
xgb_clf.fit(X_train, y_train, early_stopping_rounds=200, eval_metric="auc",
            eval_set=[(X_train, y_train), (X_test, y_test)])
xgb_roc_score = roc_auc_score(y_test, xgb_clf.predict_proba(X_test)[:, 1], average='macro')
print(xgb_roc_score)
fig, ax = plt.subplots(1, 1, figsize=(10, 8))
plot_importance(xgb_clf, ax=ax, max_num_features=20, height=0.4)
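For reference, the same top-20 ranking can be pulled out as data rather than a plot, using the fitted model's feature_importances_ attribute; a small sketch, not part of the original post:

# Top 20 features by importance as a pandas Series instead of a plot
feat_imp = pd.Series(xgb_clf.feature_importances_, index=X_train.columns)
print(feat_imp.sort_values(ascending=False).head(20))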
lgbm_clf = LGBMClassifier(n_estimators=500)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
             eval_set=evals, verbose=True)
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print(lgbm_roc_score)
# n_estimators is reduced to 200 to speed up the hyperparameter search
lgbm_clf = LGBMClassifier(n_estimators=200)
params = {'num_leaves': [32, 64], 'max_depth': [128, 160],
          'min_child_samples': [60, 100], 'subsample': [0.8, 1]}
# cv is left unspecified to speed up the hyperparameter search
gridcv = GridSearchCV(lgbm_clf, param_grid=params)
gridcv.fit(X_train, y_train, early_stopping_rounds=30, eval_metric="auc",
           eval_set=[(X_train, y_train), (X_test, y_test)])
print('GridSearchCV best parameters:', gridcv.best_params_)
lgbm_roc_score = roc_auc_score(y_test, gridcv.predict_proba(X_test)[:, 1], average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))
lgbm_clf = LGBMClassifier(n_estimators=1000, num_leaves=32, subsample=0.8,
                          min_child_samples=100, max_depth=128)
evals = [(X_test, y_test)]
lgbm_clf.fit(X_train, y_train, early_stopping_rounds=100, eval_metric="auc",
             eval_set=evals, verbose=True)
lgbm_roc_score = roc_auc_score(y_test, lgbm_clf.predict_proba(X_test)[:, 1], average='macro')
print('ROC AUC: {0:.4f}'.format(lgbm_roc_score))