Title: | Fully Automatic Generation of Scorecards |
---|---|
Description: | Provides an efficient suite of R tools for scorecard modeling, analysis, and visualization. Includes equal frequency binning, equidistant binning, K-means binning, chi-square binning, decision tree binning, data screening, manual parameter modeling, fully automatic generation of scorecards, etc. This package is designed to make scorecard development easier and faster. References include: 1. <http://shichen.name/posts/>. 2. Dong-feng Li (Peking University), class PPT. 3. <https://zhuanlan.zhihu.com/p/389710022>. 4. <https://www.zhangshengrong.com/p/281oqR9JNw/>. |
Authors: | Tai-Sen Zheng [aut, cre] |
Maintainer: | Tai-Sen Zheng <[email protected]> |
License: | AGPL-3 |
Version: | 0.3.0 |
Built: | 2024-11-04 04:19:09 UTC |
Source: | https://github.com/cran/autoScorecard |
Functions to Automatically Generate Scorecards
auto_scorecard( feature = accepts, key_var = "application_id", y_var = "bad_ind", sample_rate = 0.7, base0 = FALSE, points0 = 600, odds0 = 1/20, pdo = 50, k = 2, max_depth = 3, tree_p = 0.1, missing_rate = 0, single_var_rate = 1, iv_set = 0.02, char_to_number = TRUE, na.omit = TRUE )
auto_scorecard( feature = accepts, key_var = "application_id", y_var = "bad_ind", sample_rate = 0.7, base0 = FALSE, points0 = 600, odds0 = 1/20, pdo = 50, k = 2, max_depth = 3, tree_p = 0.1, missing_rate = 0, single_var_rate = 1, iv_set = 0.02, char_to_number = TRUE, na.omit = TRUE )
feature |
A data.frame with independent variables and target variable. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
sample_rate |
Training set sampling percentage. |
base0 |
Whether the scorecard base score is 0. |
points0 |
Base point. |
odds0 |
The base odds corresponding to points0. |
pdo |
Points to Double the Odds (PDO): the number of score points required to double the odds. |
k |
Each scale doubles the probability of default several times. |
max_depth |
Set the maximum depth of any node of the final tree, with the root node counted as depth 0. Values greater than 30 rpart will give nonsense results on 32-bit machines. |
tree_p |
Satisfies the conversion formula: minbucket = round(tree_p * nrow(df)). Smallest bucket (rpart): minimum number of observations in any terminal <leaf> node. |
missing_rate |
Data missing rate, variables smaller than this setting will be deleted. |
single_var_rate |
The maximum proportion of a single variable, the variable greater than the setting will be deleted. |
iv_set |
IV value minimum threshold, variable IV value less than the setting will be deleted. |
char_to_number |
Whether to convert character variables to numeric. |
na.omit |
Whether to remove incomplete cases (rows containing NA), as in stats::na.omit. |
A list containing data, bins, scorecards and models.
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) auto_scorecard1 <- auto_scorecard( feature = accepts[1:2000,], key_var= "application_id", y_var = "bad_ind",sample_rate = 0.7, points0 = 600, odds0=1/20, pdo = 50, max_depth = 3, tree_p = 0.1, missing_rate = 0, single_var_rate = 1, iv_set = 0.02, char_to_number = TRUE , na.omit = TRUE)
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) auto_scorecard1 <- auto_scorecard( feature = accepts[1:2000,], key_var= "application_id", y_var = "bad_ind",sample_rate = 0.7, points0 = 600, odds0=1/20, pdo = 50, max_depth = 3, tree_p = 0.1, missing_rate = 0, single_var_rate = 1, iv_set = 0.02, char_to_number = TRUE , na.omit = TRUE)
Calculate the Best IV Value for the Binned Data
best_iv(df, variable, bin, method, label_iv)
best_iv(df, variable, bin, method, label_iv)
df |
A data.frame with independent variables and target variable. |
variable |
Name of variable. |
bin |
Name of bins. |
method |
Name of method. |
label_iv |
Name of IV. |
A data frame of best IV, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 ) best1 <- best_iv( df=f_1 ,bin=c('bins') , method = c('method') , variable= c( "variable" ) ,label_iv='miv' )
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 ) best1 <- best_iv( df=f_1 ,bin=c('bins') , method = c('method') , variable= c( "variable" ) ,label_iv='miv' )
The Combination of Two Bins Produces the Best Binning Result
best_vs(df1, df2, variable = "variable", label_iv = "miv")
best_vs(df1, df2, variable = "variable", label_iv = "miv")
df1 |
A binned data. |
df2 |
A binned data. |
variable |
A name of X variable. |
label_iv |
A name of target variable. |
A data frame of best IV.
accepts <- read.csv(system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree(df = feature, key_var= "application_id", y_var= "bad_ind" , max_depth = 3, p = 0.1 ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 ) best1 <- best_iv( df=f_1 ,bin=c('bins') , method = c('method') , variable= c( "variable" ) ,label_iv='miv' ) vs1 <- best_vs( df1 = all2[,-c(3)], df2 = best1[,-c(1:2)] ,variable="variable" ,label_iv='miv' )
accepts <- read.csv(system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree(df = feature, key_var= "application_id", y_var= "bad_ind" , max_depth = 3, p = 0.1 ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 ) best1 <- best_iv( df=f_1 ,bin=c('bins') , method = c('method') , variable= c( "variable" ) ,label_iv='miv' ) vs1 <- best_vs( df1 = all2[,-c(3)], df2 = best1[,-c(1:2)] ,variable="variable" ,label_iv='miv' )
Equal Frequency Binning
binning_eqfreq(df, feat, label, nbins = 3)
binning_eqfreq(df, feat, label, nbins = 3)
df |
A data.frame with independent variables and target variable. |
feat |
A name of the independent variable (the feature to be binned). |
label |
A name of target variable. |
nbins |
Number of bins. Default: 3. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata", "accepts.csv", package ="autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) binning_eqfreq1 <- binning_eqfreq( df= feature, feat= 'tot_derog', label = 'bad_ind', nbins = 3)
accepts <- read.csv( system.file( "extdata", "accepts.csv", package ="autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) binning_eqfreq1 <- binning_eqfreq( df= feature, feat= 'tot_derog', label = 'bad_ind', nbins = 3)
Equal Width Binning
binning_eqwid(df, feat, label, nbins = 3)
binning_eqwid(df, feat, label, nbins = 3)
df |
A data.frame with independent variables and target variable. |
feat |
A name of the independent variable (the feature to be binned). |
label |
A name of target variable. |
nbins |
Number of bins. Default: 3. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) binning_eqwid1 <- binning_eqwid( df = feature, feat = 'tot_derog', label = 'bad_ind', nbins = 3 )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) binning_eqwid1 <- binning_eqwid( df = feature, feat = 'tot_derog', label = 'bad_ind', nbins = 3 )
The K-means Binning The k-means binning method first specifies the number of centers, classifies the observation points using the Euclidean distance to the center points, and then recalculates the center points until they no longer change, using the classification result as the binning result.
binning_kmean(df, feat, label, nbins = 3)
binning_kmean(df, feat, label, nbins = 3)
df |
A data.frame with independent variables and target variable. |
feat |
A name of the independent variable (the feature to be binned). |
label |
A name of target variable. |
nbins |
Number of bins. Default: 3. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) ddd <- binning_kmean( df = feature, feat= 'loan_term', label = 'bad_ind', nbins = 3)
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) ddd <- binning_kmean( df = feature, feat= 'loan_term', label = 'bad_ind', nbins = 3)
Chi-Square Binning Chi-square binning, using the ChiMerge algorithm for bottom-up merging based on the chi-square test.
bins_chim(df, key_var, y_var, alpha)
bins_chim(df, key_var, y_var, alpha)
df |
A data.frame with independent variables and target variable. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
alpha |
Significance level for discretization. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature2 <- stats::na.omit( accepts[1:200,c(1,3,7:23)] ) all3 <- bins_chim( df = feature2 , key_var = "application_id", y_var = "bad_ind" , alpha=0.1 )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature2 <- stats::na.omit( accepts[1:200,c(1,3,7:23)] ) all3 <- bins_chim( df = feature2 , key_var = "application_id", y_var = "bad_ind" , alpha=0.1 )
Automatic Binning Based on Decision Tree Automatic Binning Based on Decision Tree(rpart).
bins_tree(df, key_var, y_var, max_depth = 3, p = 0.1)
bins_tree(df, key_var, y_var, max_depth = 3, p = 0.1)
df |
A data.frame with independent variables and target variable. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
max_depth |
Set the maximum depth of any node of the final tree, with the root node counted as depth 0. Values greater than 30 rpart will give nonsense results on 32-bit machines. |
p |
Satisfies the conversion formula: minbucket = round(p * nrow(df)). Smallest bucket (rpart): minimum number of observations in any terminal <leaf> node. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv(system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree(df = feature, key_var= "application_id", y_var= "bad_ind" , max_depth = 3, p = 0.1 )
accepts <- read.csv(system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree(df = feature, key_var= "application_id", y_var= "bad_ind" , max_depth = 3, p = 0.1 )
Unsupervised Automatic Binning Function By setting bin_nums, performs three unsupervised automatic binning methods.
bins_unsupervised( df, id, label, methods = c("k_means", "equal_width", "equal_freq"), bin_nums )
bins_unsupervised( df, id, label, methods = c("k_means", "equal_width", "equal_freq"), bin_nums )
df |
A data.frame with independent variables and target variable. |
id |
A name of index. |
label |
A name of target variable. |
methods |
Simultaneously calculates three kinds of unsupervised binning ("k_means", "equal_width", "equal_freq"); the parameter only determines the final output result. |
bin_nums |
Number of bins. |
A data frame, including the contents of the bin, the upper bound of the bin, the lower bound of the bin, and all the contents returned by the get_IV function.
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 )
accepts <- read.csv( system.file( "extdata" , "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) f_1 <-bins_unsupervised( df = feature , id="application_id" , label="bad_ind" , methods = c("k_means", "equal_width","equal_freq" ) , bin_nums=10 )
Compare the Distribution of Two Variables Draw box plots, CDF plots, QQ plots and histograms for two data vectors.
comparison_two(var_A, var_B, name_A, name_B)
comparison_two(var_A, var_B, name_A, name_B)
var_A |
A variable. |
var_B |
A variable. |
name_A |
The name of data A. |
name_B |
The name of data B. |
No return value, called for side effects
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) comparison_two( var_A = accepts$purch_price ,var_B = accepts$tot_rev_line , name_A = 'purch_price' , name_B = "tot_rev_line" )
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) comparison_two( var_A = accepts$purch_price ,var_B = accepts$tot_rev_line , name_A = 'purch_price' , name_B = "tot_rev_line" )
Compare the Distribution of the Two Data
comparison_two_data(df1, df2, key_var, y_var)
comparison_two_data(df1, df2, key_var, y_var)
df1 |
A data. |
df2 |
A data. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
No return value, called for side effects
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] comparison_two_data( df1 = train , df2 = test , key_var = c("application_id","account_number"), y_var="bad_ind" )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] comparison_two_data( df1 = train , df2 = test , key_var = c("application_id","account_number"), y_var="bad_ind" )
Data Description Function
data_detect(df, key_var, y_var)
data_detect(df, key_var, y_var)
df |
A data. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
A data frame of data description.
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) aaa <- data_detect( df = accepts, key_var = c("application_id","account_number") , y_var = "bad_ind" )
accepts <- read.csv(system.file("extdata", "accepts.csv", package = "autoScorecard" )) aaa <- data_detect( df = accepts, key_var = c("application_id","account_number") , y_var = "bad_ind" )
Data Filtering
filter_var( df, key_var, y_var, missing_rate, single_var_rate, iv_set, char_to_number = TRUE, na.omit = TRUE )
filter_var( df, key_var, y_var, missing_rate, single_var_rate, iv_set, char_to_number = TRUE, na.omit = TRUE )
df |
A data.frame with independent variables and target variable. |
key_var |
A name of the index variable. |
y_var |
A name of target variable. |
missing_rate |
Data missing rate, variables smaller than this setting will be deleted. |
single_var_rate |
The maximum proportion of a single variable, the variable greater than the setting will be deleted. |
iv_set |
IV value minimum threshold, variable IV value less than the setting will be deleted. |
char_to_number |
Whether to convert character variables to numeric. |
na.omit |
Whether to remove incomplete cases (rows containing NA), as in stats::na.omit. |
A data frame.
accepts <- read.csv( system.file( "extdata" , "accepts.csv",package = "autoScorecard" )) fff1 <- filter_var( df = accepts, key_var = "application_id", y_var = "bad_ind", missing_rate = 0, single_var_rate = 1, iv_set = 0.02 )
accepts <- read.csv( system.file( "extdata" , "accepts.csv",package = "autoScorecard" )) fff1 <- filter_var( df = accepts, key_var = "application_id", y_var = "bad_ind", missing_rate = 0, single_var_rate = 1, iv_set = 0.02 )
Function to Calculate IV Value
get_IV(df, feat, label, E = 0, woeInf.rep = 1e-04)
get_IV(df, feat, label, E = 0, woeInf.rep = 1e-04)
df |
A data.frame with independent variables and target variable. |
feat |
A name of the independent variable (the feature for which IV is calculated). |
label |
A name of target variable. |
E |
Constant, should be in [0, 1]; used to prevent calculation overflow when a bin contains no data. |
woeInf.rep |
Woe replaces the constant, and when woe is positive or negative infinity, it is replaced by a constant. |
A data frame including counts, proportions, odds, woe, and IV values for each stratum.
accepts <- read.csv( system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) iv1 = get_IV( df= feature ,feat ='tot_derog' , label ='bad_ind' )
accepts <- read.csv( system.file( "extdata", "accepts.csv", package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) iv1 = get_IV( df= feature ,feat ='tot_derog' , label ='bad_ind' )
Manually Input Parameters to Generate Scorecards
noauto_scorecard( bins_card, fit, bins_woe, points0 = 600, odds0 = 1/19, pdo = 50, k = 2 )
noauto_scorecard( bins_card, fit, bins_woe, points0 = 600, odds0 = 1/19, pdo = 50, k = 2 )
bins_card |
Binning template. |
fit |
A fitted model object; see stats::glm. |
bins_woe |
A data frame of woe with independent variables and target variable. |
points0 |
Base point. |
odds0 |
The base odds corresponding to points0. |
pdo |
Points to Double the Odds (PDO): the number of score points required to double the odds. |
k |
Each scale doubles the probability of default several times. |
A data frame with score ratings.
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score1 <- noauto_scorecard( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score1 <- noauto_scorecard( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 )
Manually Input Parameters to Generate Scorecards The basic score is dispersed into each feature score
noauto_scorecard2( bins_card, fit, bins_woe, points0 = 600, odds0 = 1/19, pdo = 50, k = 3 )
noauto_scorecard2( bins_card, fit, bins_woe, points0 = 600, odds0 = 1/19, pdo = 50, k = 3 )
bins_card |
Binning template. |
fit |
A fitted model object; see stats::glm. |
bins_woe |
A data frame of woe with independent variables and target variable. |
points0 |
Base point. |
odds0 |
The base odds corresponding to points0. |
pdo |
Points to Double the Odds (PDO): the number of score points required to double the odds. |
k |
Each scale doubles the probability of default several times. |
A data frame with score ratings.
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score2 <- noauto_scorecard2( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score2 <- noauto_scorecard2( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 )
Data Painter Function Draw K-S diagram, Lorenz diagram, lift diagram and AUC diagram.
plot_board(label, pred)
plot_board(label, pred)
label |
A target variable. |
pred |
A predictor variable. |
No return value, called for side effects
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg<-stats::glm(bad_ind~.,family=stats::binomial(link='logit'),data= woe_train) lg_both<-stats::step(lg,direction = "both") logit<-stats::predict(lg_both,woe_test) woe_test$lg_both_p<-exp(logit)/(1+exp(logit)) plot_board( label= woe_test$bad_ind, pred = woe_test$lg_both_p )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg<-stats::glm(bad_ind~.,family=stats::binomial(link='logit'),data= woe_train) lg_both<-stats::step(lg,direction = "both") logit<-stats::predict(lg_both,woe_test) woe_test$lg_both_p<-exp(logit)/(1+exp(logit)) plot_board( label= woe_test$bad_ind, pred = woe_test$lg_both_p )
PSI Calculation Function
psi_cal(df_train, df_test, feat, label, nbins = 10)
psi_cal(df_train, df_test, feat, label, nbins = 10)
df_train |
Train data. |
df_test |
Test data. |
feat |
A name of the variable on which PSI is computed (e.g. the score column). |
label |
A name of target variable. |
nbins |
Number of bins. |
A data frame of PSI.
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score_2 <- noauto_scorecard( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 ) Score_1<- noauto_scorecard( bins_card = woe_train, fit = lg_both, bins_woe = treebins_train, points0 = 600, odds0 = 1/20, pdo = 50 ) psi_1<- psi_cal( df_train = Score_1$data_score , df_test = Score_2$data_score, feat = 'Score',label ='bad_ind' , nbins =10 )
accepts <- read.csv( system.file( "extdata", "accepts.csv" , package = "autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) d = sort( sample( nrow( feature ), nrow( feature )*0.7)) train <- feature[d,] test <- feature[-d,] treebins_train <- bins_tree( df = train, key_var = "application_id", y_var="bad_ind", max_depth=3, p=0.1) woe_train <- rep_woe( df= train , key_var = "application_id", y_var = "bad_ind" , tool = treebins_train ,var_label = "variable",col_woe = 'woe', lower = 'lower' , upper = 'upper') woe_test <- rep_woe( df = test , key_var ="application_id", y_var= "bad_ind", tool = treebins_train ,var_label= "variable", col_woe = 'woe', lower = 'lower' ,upper = 'upper' ) lg <- stats::glm( bad_ind~. , family = stats::binomial( link = 'logit' ) , data = woe_train ) lg_both <- stats::step( lg , direction = "both") Score_2 <- noauto_scorecard( bins_card= woe_test , fit =lg_both , bins_woe = treebins_train , points0 = 600 , odds0 = 1/20 , pdo = 50 ) Score_1<- noauto_scorecard( bins_card = woe_train, fit = lg_both, bins_woe = treebins_train, points0 = 600, odds0 = 1/20, pdo = 50 ) psi_1<- psi_cal( df_train = Score_1$data_score , df_test = Score_2$data_score, feat = 'Score',label ='bad_ind' , nbins =10 )
Replace Feature Data by Binning Template
rep_woe(df, key_var, y_var, tool, var_label, col_woe, lower, upper)
rep_woe(df, key_var, y_var, tool, var_label, col_woe, lower, upper)
df |
A data.frame with independent variables and target variable. |
key_var |
A name of index variable name. |
y_var |
A name of target variable. |
tool |
Binning template. |
var_label |
The name of the characteristic variable. |
col_woe |
The name of the woe variable. |
lower |
The name of the binning lower bound. |
upper |
The name of the binning upper bound. |
A data frame of woe values.
accepts <- read.csv( system.file( "extdata", "accepts.csv", package ="autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree( df = feature, key_var = "application_id", y_var = "bad_ind", max_depth = 3, p= 0.1) re2 <- rep_woe( df= feature ,key_var = "application_id", y_var = "bad_ind", tool = all2, var_label = "variable",col_woe ='woe', lower ='lower',upper ='upper')
accepts <- read.csv( system.file( "extdata", "accepts.csv", package ="autoScorecard" )) feature <- stats::na.omit( accepts[,c(1,3,7:23)] ) all2 <- bins_tree( df = feature, key_var = "application_id", y_var = "bad_ind", max_depth = 3, p= 0.1) re2 <- rep_woe( df= feature ,key_var = "application_id", y_var = "bad_ind", tool = all2, var_label = "variable",col_woe ='woe', lower ='lower',upper ='upper')