Note: compiled with reference to several CSDN and Bilibili articles.
I. Experiment Background
An institution wants to predict which of its customers are likely to default on a loan. It has collected data on historical customer behavior along with information on target customers, and hopes to use the historical data to predict which target customers are potential defaulters, thereby narrowing the candidate pool and enabling low-risk lending.
The collected data is stored as .CSV files, one for historical customers and one for target customers. The fields are described below:
Field name           Field description                 Data type
income               Customer income                   int
age                  Customer age                      int
experience           Years of work experience          int
profession           Occupation                        string
married              Marital status                    string
house_ownership      Owned / rented / other            string
car_ownership        Whether the customer owns a car   string
risk_flag            Whether the loan was defaulted    string
currentjobyears      Years in the current job          int
currenthouseyears    Years at the current residence    int
city                 City of residence                 string
state                State of residence                string
II. Experiment Content
Using classification methods, predict potential defaulting customers from historical customer behavior.
III. Experiment Steps
1. Importing the data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import warnings
from sklearn.preprocessing import LabelEncoder

warnings.filterwarnings('ignore')

# 1. Data import
app_train = pd.read_csv('D:/班级作业/数据挖掘/实验报告/实验二/archive/historic customer behavior.csv')
app_test = pd.read_csv('D:/班级作业/数据挖掘/实验报告/实验二/archive/target customer.csv')
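Immediately after loading, it is worth verifying the files against the field table in Section I. A minimal sketch, using only the two dataframes defined above:

# Sanity check: shapes, dtypes, and missing values versus the field table in Section I
print(app_train.shape, app_test.shape)
print(app_train.dtypes)
print(app_train.isnull().sum())  # any missing values should be handled before modeling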
2. Exploratory data analysis
# 2. Exploratory data analysis
# Plotting helpers
def plot_stats(feature, label_rotation=False, horizontal_layout=True):
    temp = app_train[feature].value_counts()
    df1 = pd.DataFrame({feature: temp.index, 'Number of contracts': temp.values})  # fixed: pd.DataFrame (was pd.Dataframe)

    # Default rate (mean of Risk_Flag) within each category of the attribute
    cat_perc = app_train[[feature, 'Risk_Flag']].groupby([feature], as_index=False).mean()
    cat_perc.sort_values(by='Risk_Flag', ascending=False, inplace=True)

    if horizontal_layout:
        fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(12, 6))
    else:
        fig, (ax1, ax2) = plt.subplots(nrows=2, figsize=(12, 14))
    sns.set_color_codes("pastel")

    s = sns.barplot(ax=ax1, x=feature, y="Number of contracts", data=df1)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)

    s = sns.barplot(ax=ax2, x=feature, y='Risk_Flag', order=cat_perc[feature], data=cat_perc)
    if label_rotation:
        s.set_xticklabels(s.get_xticklabels(), rotation=90)
    plt.ylabel('Percent of Risk_Flag with value 1 [%]', fontsize=10)
    plt.tick_params(axis='both', which='major', labelsize=10)
    plt.gcf().subplots_adjust(left=0.05, top=0.91, bottom=0.09)
    plt.show()

def plot_distribution(var):
    i = 0
    t1 = app_train.loc[app_train['Risk_Flag'] != 0]
    t0 = app_train.loc[app_train['Risk_Flag'] == 0]
    sns.set_style('whitegrid')
    plt.figure()
    fig, ax = plt.subplots(2, 2, figsize=(6, 6))
    for feature in var:
        i += 1
        plt.subplot(2, 2, i)
        sns.kdeplot(t1[feature], bw=0.5, label="Risk_Flag = 1")
        sns.kdeplot(t0[feature], bw=0.5, label="Risk_Flag = 0")
        plt.ylabel('Density plot', fontsize=12)
        plt.xlabel(feature, fontsize=12)
        locs, labels = plt.xticks()
        plt.tick_params(axis='both', which='major', labelsize=12)
    plt.show()

plt.figure(figsize=(5, 6))
# 1) Effect of age
# KDE (kernel density estimate) of age for customers who repaid on time
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 0, 'Age'], label='Risk_Flag == 0')
# KDE of age for customers who did not repay on time
sns.kdeplot(app_train.loc[app_train['Risk_Flag'] == 1, 'Age'], label='Risk_Flag == 1')
# Labels
plt.xlabel('Age (years)')
plt.ylabel('Density')
plt.title('Distribution of Ages')
plt.gcf().subplots_adjust(left=0.05, right=0.91, top=0.9, bottom=0.09)
plt.show()

# Effect of age
plot_stats('Age', False, False)
# 2) Effect of income
plot_stats('Income', False, False)
# 3) Effect of car/home ownership (fixed: the original called plot_stats with
# 'FLAG_OWN_CAR'/'FLAG_OWN_REALTY', column names from a different dataset)
plot_stats('Car_Ownership')
plot_stats('House_Ownership')
# 4) Effect of marital status
plot_stats('Married/Single', True, True)
# 5) Effect of work experience
plot_stats('Experience', False, False)
1) Exploring the age feature
Processing results:
Figure 2: line plot of age against whether a default occurred
Figure 3: proportion of defaulters in each age bracket
Analysis:
Figure 2 shows that defaulting customers are concentrated among young customers aged 20-30, so we can hypothesize that the younger the customer, the more likely a default;
Figure 3 bins the customers by age and examines the default probability of each bracket: customers in [20,25] and [25,30] have the highest default probability, while the remaining brackets are roughly similar (the binning can be reproduced with the sketch below);
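A minimal sketch of the binning behind Figure 3 using pd.cut, assuming Risk_Flag is the 0/1 default indicator (so its mean per bracket is the default rate); the 5-year bin edges are illustrative:

# Default rate per 5-year age bracket (mean of a 0/1 flag = default rate)
age_bins = pd.cut(app_train['Age'], bins=range(20, 85, 5), right=False)
print(app_train.groupby(age_bins)['Risk_Flag'].mean())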
2) Whether the customer owns a home
Analysis: the figure shows that customers without a home have a higher default rate than homeowners.
3) Whether the customer owns a car
Analysis: the figure shows that customers without a car have a higher default rate than car owners, though the gap is small.
4) Marital status
Analysis: the figure shows that unmarried customers have a higher default rate than married ones.
5) Income
Analysis: the figure shows that lower-income customers are more likely to default.
6) Work experience
Analysis: the figure shows that customers with shorter work experience are more likely to default.
7) Heatmap
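The code behind the heatmap is not shown above; a minimal sketch of a correlation heatmap over the numeric columns (column names as used in the modeling code later in this report) would be:

# Correlation heatmap over the numeric columns (sketch)
num_cols = ['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS', 'Risk_Flag']
plt.figure(figsize=(8, 6))
sns.heatmap(app_train[num_cols].corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Correlation heatmap of numeric features')
plt.show()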
3. Feature preprocessing
Construct a new feature, DAYS_EMPLOYED_PERCENT: the customer's years of work experience (experience) divided by the customer's age.
# 3. Feature preprocessing
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

# Construct new features on copies of the train and test sets
app_train_domain = app_train.copy()
app_test_domain = app_test.copy()

# New feature: years of work experience / age
app_train_domain['DAYS_EMPLOYED_PERCENT'] = app_train_domain['Experience'] / app_train_domain['Age']
# app_train_domain['INCOME_HOUSE'] = app_train_domain['Income'] / app_train_domain['House_Ownership']  # income / home-ownership status
# app_train_domain['INCOME_CAR'] = app_train_domain['Income'] / app_train_domain['Car_Ownership']  # income / car-ownership status

plt.figure(figsize=(10, 10))
# Iterate over the newly constructed features
for i, feature in enumerate(['DAYS_EMPLOYED_PERCENT']):
    # Create a subplot
    plt.subplot(1, 1, i + 1)
    # KDE for customers who repaid on time
    sns.kdeplot(app_train_domain.loc[app_train_domain['Risk_Flag'] == 0, feature], label='Risk_Flag == 0')
    # KDE for customers who did not repay on time
    sns.kdeplot(app_train_domain.loc[app_train_domain['Risk_Flag'] == 1, feature], label='Risk_Flag == 1')
    plt.title('Distribution of %s by Target Value' % feature)
    plt.xlabel('%s' % feature)
    plt.ylabel('Density')
plt.tight_layout(h_pad=2.5)
plt.show()

app_test_domain['DAYS_EMPLOYED_PERCENT'] = app_test_domain['Experience'] / app_test_domain['Age']
4. Modeling and analysis
# 4. Modeling
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import gc

# 1) LGBMClassifier model
def model(features, test_features, encoding='ohe', n_folds=5):
    # Extract the ids (the training file uses 'Id', the test file 'ID')
    train_ids = features['Id']
    test_ids = test_features['ID']

    # Extract the labels from the training set
    labels = features['Risk_Flag']

    # Drop the id and target columns
    features = features.drop(columns=['Id', 'Risk_Flag'])
    test_features = test_features.drop(columns=['ID'])

    # One-hot encoding
    if encoding == 'ohe':
        features = pd.get_dummies(features)
        test_features = pd.get_dummies(test_features)
        # Align the train and test sets on their common feature columns
        features, test_features = features.align(test_features, join='inner', axis=1)
        # No categorical indices to record
        cat_indices = 'auto'
    # Integer label encoding
    elif encoding == 'le':
        # Create the encoder
        label_encoder = LabelEncoder()
        # List for storing the categorical column indices
        cat_indices = []
        # Iterate over the columns
        for i, col in enumerate(features):
            if features[col].dtype == 'object':
                # Map the categorical feature to integers
                features[col] = label_encoder.fit_transform(np.array(features[col].astype(str)).reshape((-1,)))
                test_features[col] = label_encoder.transform(np.array(test_features[col].astype(str)).reshape((-1,)))
                # Record the categorical index
                cat_indices.append(i)
    # Raise an error for any other encoding scheme
    else:
        raise ValueError("Encoding must be either 'ohe' or 'le'")

    print('Training Data Shape: ', features.shape)
    print('Testing Data Shape: ', test_features.shape)

    # Names of the training features
    feature_names = list(features.columns)

    # Convert to numpy arrays
    features = np.array(features)
    test_features = np.array(test_features)

    # K-fold cross-validation object
    k_fold = KFold(n_splits=n_folds, shuffle=True, random_state=50)

    # Empty array for the feature importances
    feature_importance_values = np.zeros(len(feature_names))

    # Empty array for the test predictions
    test_predictions = np.zeros(test_features.shape[0])

    # Empty array for the out-of-fold validation predictions
    out_of_fold = np.zeros(features.shape[0])

    # Lists for the validation and training scores
    valid_scores = []
    train_scores = []

    # Iterate over the folds
    for train_indices, valid_indices in k_fold.split(features):
        # Training data for this fold
        train_features, train_labels = features[train_indices], labels[train_indices]
        # Validation data for this fold
        valid_features, valid_labels = features[valid_indices], labels[valid_indices]

        # Create the LGBMClassifier model
        model = lgb.LGBMClassifier(n_estimators=1000, objective='binary',
                                   class_weight='balanced', learning_rate=0.05,
                                   reg_alpha=0.1, reg_lambda=0.1,
                                   subsample=0.8, n_jobs=-1, random_state=50)

        # Train the model (early_stopping_rounds/verbose as fit() arguments require
        # lightgbm < 4; newer versions expect the equivalent callbacks instead)
        model.fit(train_features, train_labels, eval_metric='auc',
                  eval_set=[(valid_features, valid_labels), (train_features, train_labels)],
                  eval_names=['valid', 'train'],
                  categorical_feature=cat_indices,
                  early_stopping_rounds=100, verbose=200)

        # Early stopping was used, so read the results from the best iteration
        best_iteration = model.best_iteration_

        # Accumulate the feature importances
        feature_importance_values += model.feature_importances_ / k_fold.n_splits

        # Accumulate the test predictions
        test_predictions += model.predict_proba(test_features, num_iteration=best_iteration)[:, 1] / k_fold.n_splits

        # Predict on the validation split (out-of-fold)
        out_of_fold[valid_indices] = model.predict_proba(valid_features, num_iteration=best_iteration)[:, 1]

        # Record the best scores
        valid_score = model.best_score_['valid']['auc']
        train_score = model.best_score_['train']['auc']
        valid_scores.append(valid_score)
        train_scores.append(train_score)

        # Free memory
        gc.enable()
        del model, train_features, valid_features
        gc.collect()

    # Dataframe for the submission file (fixed: pd.DataFrame, was pd.Dataframe)
    submission = pd.DataFrame({'ID': test_ids, 'Risk_Flag': test_predictions})

    # Dataframe of feature importances
    feature_importances = pd.DataFrame({'feature': feature_names, 'importance': feature_importance_values})

    # Overall validation score
    valid_auc = roc_auc_score(labels, out_of_fold)

    # Append the overall scores to the metrics
    valid_scores.append(valid_auc)
    train_scores.append(np.mean(train_scores))

    # Fold names for the metrics dataframe
    fold_names = list(range(n_folds))
    fold_names.append('overall')

    # Dataframe of validation scores
    metrics = pd.DataFrame({'fold': fold_names, 'train': train_scores, 'valid': valid_scores})

    return submission, feature_importances, metrics

# Train the model with the helper above and print the 5-fold cross-validation results
submission, fi, metrics = model(app_train_domain, app_test_domain)
print('baseline metrics')
print(metrics)
submission.to_csv('my_submission1.csv', index=False)
del app_train_domain, app_test_domain
gc.collect()  # fixed: the original was missing the parentheses

def plot_feature_importances(df):
    # Sort the features by importance
    df = df.sort_values('importance', ascending=False).reset_index()

    # Normalize the importances so they sum to 1
    df['importance_normalized'] = df['importance'] / df['importance'].sum()

    # Horizontal bar chart of feature importances
    plt.figure(figsize=(5, 3))
    ax = plt.subplot()

    # Most important features at the top
    ax.barh(list(reversed(list(df.index[:15]))),
            df['importance_normalized'].head(15),
            align='center', edgecolor='k')

    # y-axis ticks and labels
    ax.set_yticks(list(reversed(list(df.index[:15]))))
    ax.set_yticklabels(df['feature'].head(15))

    # x-axis label and title
    plt.xlabel('Normalized importance')
    plt.title('Feature importances')
    plt.show()
    return df

fi_sorted = plot_feature_importances(fi)

# 2) XGBoost model
# Encode the nominal columns (the fitted OrdinalEncoder is only used to inspect the
# categories; the actual encoding below uses map)
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(app_train.iloc[:, 3:6])
enc.categories_
app_train['Married/Single'] = app_train['Married/Single'].map({'single': 0, 'married': 1})
app_train['House_Ownership'] = app_train['House_Ownership'].map({'norent_noown': 0, 'rented': 1, 'owned': 2})
app_train['Car_Ownership'] = app_train['Car_Ownership'].map({'no': 0, 'yes': 1})
app_train

# Standardize the quantitative features
from sklearn.preprocessing import StandardScaler
app_train[["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]] = StandardScaler().fit_transform(
    app_train[["Income", "Age", "Experience", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]])
app_train

# Split into training and test sets
from sklearn.model_selection import train_test_split
X = app_train[["Income", "Age", "Experience", "Married/Single", "House_Ownership",
               "Car_Ownership", "CURRENT_JOB_YRS", "CURRENT_HOUSE_YRS"]]
y = app_train[["Risk_Flag"]]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_test

# Train the xgboost model (fixed: the original passed the misspelled n_estimatores and
# early_stopping_rounds=200 to the constructor; early stopping requires an eval_set in
# fit(), so it is omitted here, and n_jobs replaces the deprecated nthread alias)
from xgboost import XGBClassifier
print("---------------------xgboost forest---------------------")
xgbc = XGBClassifier(n_estimators=180, n_jobs=-1, max_depth=10).fit(X_train, y_train)
print("Training-set accuracy:")
result = xgbc.score(X_train, y_train)
print(result)
print("Validation-set accuracy:")
result = xgbc.score(X_test, y_test)
print(result)

# Plot the ROC curve
from sklearn.metrics import roc_curve, auc
# Compute the ROC curve and its area (the classifier is already fitted,
# so there is no need to refit before predict_proba)
y_score = xgbc.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

# Find the best threshold (maximum Youden index)
right_index = (tpr + (1 - fpr) - 1)
yuzhi = max(right_index)
index = list(right_index).index(max(right_index))
tpr_val = tpr[index]
fpr_val = fpr[index]

# Draw the ROC curve
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()

# 3) Random forest classifier
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
print("---------------------random forest---------------------")
rf = RandomForestClassifier(n_estimators=22, random_state=0, max_depth=20).fit(X_train, y_train)
print("Training-set accuracy:")
result = rf.score(X_train, y_train)
print(result)
print("Validation-set accuracy:")
result = rf.score(X_test, y_test)
print(result)

# Plot the ROC curve (same procedure as for xgboost)
y_score = rf.predict_proba(X_test)
fpr, tpr, thresholds = roc_curve(y_test, y_score[:, 1])
roc_auc = auc(fpr, tpr)

# Find the best threshold (maximum Youden index)
right_index = (tpr + (1 - fpr) - 1)
yuzhi = max(right_index)
index = list(right_index).index(max(right_index))
tpr_val = tpr[index]
fpr_val = fpr[index]

# Draw the ROC curve
plt.subplots(figsize=(7, 5.5))
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()
1) Modeling and prediction with LightGBM
2) Modeling and prediction with random forest
3) Modeling and prediction with xgboost
IV. Experiment Summary and Discussion of Results
1. Exploratory data analysis
The effects of age, home/car ownership, marital status, income, and work experience on the default rate can be summarized as follows:
1) Customers aged [20,30] have the highest probability of default while the other age brackets are similar, which can be approximated as: the younger the customer, the higher the default rate.
2) Customers without a home/car have a higher default rate than those who own one.
3) Unmarried customers have a higher default rate than married ones.
4) Lower-income customers are more likely to default.
5) Customers with shorter work experience are more likely to default.
2. Feature preprocessing analysis
Under the hypothesis that short job tenure and few years of work experience drive loan defaults, the new feature DAYS_EMPLOYED_PERCENT was constructed as the customer's years of work experience (experience) divided by age.
3. Modeling and prediction analysis
1) Modeling with LightGBM: K-fold cross-validation gives an AUC of 0.926473 on the training set and 0.885765 on the validation set.
2) Modeling with the random forest: training-set accuracy 0.9116326530612245, validation-set accuracy 0.8934920634920634.
3) Modeling with xgboost: training-set accuracy 0.9096315192743765, validation-set accuracy 0.8965079365079365.
4) Comparing the three models, xgboost has the highest validation-set accuracy; the random forest's validation accuracy is slightly lower than xgboost's, while its training accuracy is slightly higher.
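For reference, the scores reported above can be collected into a single table (a minimal sketch using the numbers from this section; note that LightGBM was scored with AUC while the other two report accuracy, so only the latter two are directly comparable):

# Side-by-side view of the scores reported above
import pandas as pd
scores = pd.DataFrame({
    'model': ['LightGBM (AUC)', 'random forest (accuracy)', 'xgboost (accuracy)'],
    'train': [0.926473, 0.9116326530612245, 0.9096315192743765],
    'valid': [0.885765, 0.8934920634920634, 0.8965079365079365],
})
print(scores)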