import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import norm
import statsmodels.api as sm
from statsmodels.base.model import GenericLikelihoodModel
from scipy.optimize import minimize
import datetime
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
pd.options.display.max_rows = 10
pd.options.display.max_columns = 40
# allow google colab to read google drive
from google.colab import drive
drive.mount('/content/drive')
# take a glimpse of the columns that are imported
data_all_cols = pd.read_hdf('/content/drive/My Drive/quarterly_sample.h5',start=0,stop=1)
data_all_cols.columns
# import data
# do not import MI Percent since it has too many NA values
all_vars = [
'loan seq num', 'period', 'default', 'prepay', 'channel','curr upb', 'upb_prev','num of borrowers',
'loan age', 'credit score', 'dti', 'ltv',
'orig interest rate', 'loan purpose',
'prev ltv', 'Housing Starts','Housing Starts_orig',
'Building Permits', 'Building Permits_orig','New Home Sales','New Home Sales_orig',
'Existing Home Sales','Existing Home Sales_orig',
'OECD Leading Indicator YoY', 'Treasury 1yr','Treasury 1yr_orig',
'Treasury 30yr','Treasury 30yr_orig', 'State Unemployment Rate','State Unemployment Rate_orig', 'Real GDP YoY',
'ZillowHouseValue','ZillowHouseValue_orig','30yr Fixed Rate Mortgage Average','SP500 YoY']
data_all = pd.read_hdf('/content/drive/My Drive/quarterly_sample.h5',columns=all_vars)
# drop records where the credit score is NA
data_all = data_all.dropna(subset=['credit score'])
data_all = data_all[data_all['credit score']<9999]
data_all.shape
# drop anything left that is NA
data_all=data_all.dropna()
data_all.shape
# sampling
# create a training and test set where the proportion of
# defaulted loans is the same in both sets
def sampling(size, df):
non_defaults = df[~df['loan seq num'].isin(df[df.default == 1]['loan seq num'].unique())]
defaults = df[df['loan seq num'].isin(df[df.default == 1]['loan seq num'].unique())]
mortgage_count_n = non_defaults.groupby('loan seq num')['period'].count()
mortgage_weights_n = mortgage_count_n/np.sum(mortgage_count_n)
np.random.seed(seed=1)
sample_index_n = np.random.choice(mortgage_count_n.index, int(size*non_defaults.shape[0]/np.average(mortgage_count_n, weights=mortgage_weights_n)), replace=False, p=mortgage_weights_n)
mortgage_count_y = defaults.groupby('loan seq num')['period'].count()
mortgage_weights_y = mortgage_count_y/np.sum(mortgage_count_y)
np.random.seed(seed=1)
sample_index_y = np.random.choice(mortgage_count_y.index, int(size*defaults.shape[0]/np.average(mortgage_count_y, weights=mortgage_weights_y)), replace=False, p=mortgage_weights_y)
sample = df[df['loan seq num'].isin(sample_index_n) | df['loan seq num'].isin(sample_index_y)]
out_sample = df[~df['loan seq num'].isin(sample_index_n) & ~df['loan seq num'].isin(sample_index_y)]
return sample,out_sample
data_all_train,data_all_test = sampling(.7,data_all)
print(data_all_train.shape)
print(data_all_test.shape)
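Since defaulted and non-defaulted loans are sampled at the same rate, the default rate per loan-month should come out roughly equal in the two sets. A quick sanity check (a sketch added here, not part of the original pipeline):
# quick sanity check: the default rate (per loan-month row) should be similar in train and test
for name, subset in [('train', data_all_train), ('test', data_all_test)]:
    print(name, 'rows:', subset.shape[0], 'default rate: %.4f' % subset['default'].mean())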
def create_X_y_matrix(dataset):
# copy the selected columns so the assignments below don't modify the caller's
# DataFrame (and to avoid pandas' SettingWithCopyWarning)
X = dataset[all_vars].copy()
y = X[['default','prepay']].copy()
y['nothing'] = 1-y['default']-y['prepay']
# Transformed Variables
# cap loan age at 120 so the squared and cubed terms don't diverge
X['loan age'] = X['loan age'].clip(upper=120)
X['loan age squared'] = X['loan age']**2
X['loan_age_cubed'] = X['loan age']**3
X['30yr_diff'] = X['Treasury 30yr'] - X['Treasury 30yr_orig']
X['1yr_diff'] = X['Treasury 1yr'] - X['Treasury 1yr_orig']
X['ZillowHouseChg'] = X['ZillowHouseValue']/X['ZillowHouseValue_orig']
X['State Unemployment Rate Squared'] = X['State Unemployment Rate']**2
X['purpose_purchase'] = (X['loan purpose']=='P')*1
X['purpose_cashout'] = (X['loan purpose']=='C')*1
X['2_borrowers'] = (X['num of borrowers']==2)*1
X['chan_broker'] = (X['channel']=='B')*1
X['chan_correspondent'] = (X['channel']=='C')*1
X['chan_not_specified'] = (X['channel']=='T')*1
X['Housing Start_diff'] = X['Housing Starts']-X['Housing Starts_orig']
X['Building Permits_diff'] = X['Building Permits']-X['Building Permits_orig']
X['New Home Sales_diff'] = X['New Home Sales']-X['New Home Sales_orig']
X['Existing Home Sales_diff'] = X['Existing Home Sales']-X['Existing Home Sales_orig']
X['State Unemployment Rate_diff'] = X['State Unemployment Rate']-X['State Unemployment Rate_orig']
return X, y
create_X_y_matrix(data_all_train)[0].columns
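A quick hedged check (not in the original notebook): no loan-month should be flagged as both a default and a prepayment, otherwise the implied 'nothing' share would go negative.
# hedged sanity check: 'nothing' = 1 - default - prepay should never be negative
X_chk, y_chk = create_X_y_matrix(data_all_train)
print((y_chk['nothing'] >= 0).all())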
This fits the model, relying on L1 regularization for variable selection.
import statsmodels.api as st
from scipy import stats
# newer scipy removed stats.chisqprob; restore it for statsmodels' summary output
stats.chisqprob = lambda chisq, df: stats.chi2.sf(chisq, df)
# this function sets up the design matrix and regularization
# penalties for conducting multinomial logistic regression
def setup_MN_variables(train_or_test,large_penalty,small_penalty):
if train_or_test == 'train':
X,y = create_X_y_matrix(data_all_train)
else:
X,y = create_X_y_matrix(data_all_test)
# Define the variables to be used
mn_vars = ['loan age', 'credit score', 'dti', 'ltv','orig interest rate',
'Housing Starts','Building Permits','New Home Sales','Existing Home Sales','OECD Leading Indicator YoY',
'Treasury 1yr','Treasury 30yr', 'State Unemployment Rate', 'Real GDP YoY','ZillowHouseValue',
'30yr Fixed Rate Mortgage Average','SP500 YoY', '2_borrowers','loan age squared','loan_age_cubed',
'30yr_diff','1yr_diff','ZillowHouseChg','State Unemployment Rate Squared','purpose_purchase',
'purpose_cashout','chan_broker','chan_correspondent','chan_not_specified','Housing Start_diff',
'Building Permits_diff','New Home Sales_diff','Existing Home Sales_diff','State Unemployment Rate_diff']
#Initial Regularization Penalty - ALL VARIABLES
alpha_penalty = np.concatenate((np.array([0]),
np.repeat(small_penalty,34))) # penalty for default variables
alpha_penalty = np.concatenate((alpha_penalty,np.array([0]),
np.repeat(small_penalty,34))) # penalty for prepay variables
# select the model variables (dropping any duplicated columns)
X_MN = X[mn_vars].T.groupby(level=0).first().T
X_MN = X_MN[mn_vars]
# Scale the design matrix
scaler = StandardScaler()
X_MN = pd.DataFrame(scaler.fit_transform(X_MN), columns=X_MN.columns)
X_MN.index = y.index
# Insert constant term
X_MN.insert(0, 'constant', 1)
mn_vars = list(X_MN.columns.values)
# Define target vector: 0 = no event, 1 = default, 2 = prepay
y_MN = y['default']+2*y['prepay']
return X_MN, y_MN, mn_vars, alpha_penalty
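A quick shape check makes the layout explicit (a hedged sketch; the penalty values passed here are placeholders): the design matrix is a constant plus 34 standardized predictors, and the penalty vector stacks one entry per parameter for the default equation followed by the prepay equation, with both constants left unpenalized.
# hedged sketch: inspect the outputs of setup_MN_variables (penalty values are placeholders)
X_chk, y_chk, vars_chk, alpha_chk = setup_MN_variables('train', 10000, 25)
print(X_chk.shape)                  # (n_rows, 35): constant + 34 standardized predictors
print(len(alpha_chk))               # 70: 35 penalties for the default equation, then 35 for prepay
print(alpha_chk[0], alpha_chk[35])  # both constants carry a zero penalty
print(y_chk.value_counts())         # 0 = no event, 1 = default, 2 = prepay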
# this runs multinomial logistic regression and
# returns the model
def run_MN_regression(X_MN,y_MN,alpha_penalty):
# Fit the model
mdl = st.MNLogit(y_MN, X_MN)
mdl_fit = mdl.fit_regularized(method='l1_cvxopt_cp',alpha=alpha_penalty,maxiter=15,
trim_mode='size',size_trim_tol=0.01)
return mdl_fit
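The penalty vector acts per coefficient. Below is a hedged sketch of the objective being minimized (illustrative only; the exact scaling conventions are handled inside statsmodels): a very large alpha_k effectively removes variable k from the model, and coefficients that end up below size_trim_tol are trimmed to exactly zero.
# Hedged sketch of the L1-penalized objective behind fit_regularized
# (illustrative only; statsmodels handles scaling and optimization internally):
#   minimize over beta:  -loglike(beta) + sum_k alpha_k * |beta_k|
import numpy as np

def l1_penalized_objective(neg_loglike, beta, alpha):
    """neg_loglike: callable returning -log L(beta); alpha: per-coefficient penalty weights."""
    beta = np.asarray(beta, dtype=float)
    alpha = np.asarray(alpha, dtype=float)
    return neg_loglike(beta) + np.sum(alpha * np.abs(beta))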
# calculates the average log likelihood of the model
# on the dataset (a measure of performance)
def calc_avg_log_like(mdl,y,X_MN):
# calculate the log likelihood
predictions = mdl.predict(X_MN)
outcomes = pd.DataFrame()
outcomes[0] = (y==0)*1
outcomes[1] = (y==1)*1
outcomes[2] = (y==2)*1
avg_log_like_full = np.sum(np.log(np.sum(outcomes*predictions,axis=1)))/outcomes.shape[0]
return avg_log_like_full
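To make the computation concrete, a tiny worked example with made-up probabilities (illustrative numbers only, not from the data):
# tiny worked example of the average log-likelihood (made-up probabilities, for illustration only)
import numpy as np
import pandas as pd
toy_probs = pd.DataFrame({0: [0.7, 0.2, 0.6],   # P(no event)
                          1: [0.2, 0.7, 0.1],   # P(default)
                          2: [0.1, 0.1, 0.3]})  # P(prepay)
toy_y = pd.Series([0, 1, 2])                    # observed outcomes
toy_outcomes = pd.DataFrame({k: (toy_y == k)*1 for k in range(3)})
# average of log P(observed outcome): here mean(log([0.7, 0.7, 0.3])) ~= -0.64
print(np.sum(np.log(np.sum(toy_outcomes*toy_probs, axis=1)))/toy_outcomes.shape[0])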
# this function returns the results from one run of multinomial logistic
# regression by calling the above functions
def results_one_model(alpha_large_penalty,alpha_small_penalty):
# transform training variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('train',alpha_large_penalty,alpha_small_penalty)
# fit model
mdl_fit = run_MN_regression(X_MN,y_MN,alpha_penalty)
# train log_likelihoods
train_avg_ll = mdl_fit.llf/X_MN.shape[0]
# transform testing variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('test',alpha_large_penalty,alpha_small_penalty)
# test log likelihood
test_avg_ll = calc_avg_log_like(mdl_fit,y_MN,X_MN)
return train_avg_ll, test_avg_ll, mdl_fit.params, mdl_fit.tvalues
# this fits a constrained model by regularizing the variables which
# were statistically insignificant in the previous model
def results_model_constrained(param_t_values, alpha_large_penalty,alpha_small_penalty):
# transform training variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('train',alpha_large_penalty,alpha_small_penalty)
#### CUSTOM REGULARIZATION PENALTY BASED ON PREVIOUS ITERATION ####
# create custom alpha_penalty vector based on significance of parameters
t_vals = np.concatenate([param_t_values[0],param_t_values[1]])
t_vals = np.nan_to_num(t_vals)
t_vals = np.absolute(t_vals)
# if |t-value| is below 2 (~95% confidence), apply the large penalty; otherwise the small one
alpha_penalty = (t_vals<2)*alpha_large_penalty + (t_vals>=2)*alpha_small_penalty
print('ALPHA PENALTY')
print(alpha_penalty)
# do not constrain constants
alpha_penalty[0]=0
alpha_penalty[int(alpha_penalty.shape[0]/2)]=0
#### FIT MODEL AND REPORT RESULTS
# fit model
mdl_fit = run_MN_regression(X_MN,y_MN,alpha_penalty)
# train log likelihood
train_avg_ll = mdl_fit.llf/X_MN.shape[0]
# transform testing variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('test',alpha_large_penalty,alpha_small_penalty)
# test log likelihood
test_avg_ll = calc_avg_log_like(mdl_fit,y_MN,X_MN)
return train_avg_ll, test_avg_ll, mdl_fit.params, mdl_fit.tvalues, mdl_fit
# this fits a model, and then fits another model that has regularized away
# the statistically insignificant variables
def master_model_fit(alpha_large_penalty, alpha_small_penalty):
initial_train_avg_ll, initial_test_avg_ll, initial_mdl_params, initial_mdl_tvals = results_one_model(alpha_large_penalty,alpha_small_penalty)
print('Initial Model Fit')
cons_train_avg_ll, cons_test_avg_ll, cons_mdl_params, cons_mdl_tvals, full_model = results_model_constrained(initial_mdl_tvals, alpha_large_penalty,alpha_small_penalty)
return initial_train_avg_ll, initial_test_avg_ll, initial_mdl_params, initial_mdl_tvals, cons_train_avg_ll, cons_test_avg_ll, cons_mdl_params, cons_mdl_tvals, full_model
def graph_model(params):
fig, ax = plt.subplots(2, 4,figsize=(25, 10))
parpar_def = (params.T).iloc[0]
parpar_pre = (params.T).iloc[1]
param_def=parpar_def.values.reshape(parpar_def.shape[0],1)
param_pre=parpar_pre.values.reshape(parpar_pre.shape[0],1)
# i=0 train, i=1 test
for i in range(2):
if i==0:
# GET DESIGN MATRIX
X_MN, y_MN, mn_vars, alpha_penalty = setup_MN_variables('train',0,0)
if i==1:
X_MN, y_MN, mn_vars, alpha_penalty = setup_MN_variables('test',0,0)
X_def = X_MN[mn_vars]
X_pre = X_MN[mn_vars]
# remove duplicated columns
X_def = np.array(X_def.loc[:,~X_def.columns.duplicated()])
X_pre = np.array(X_pre.loc[:,~X_pre.columns.duplicated()])
if i==0:
X, yy = create_X_y_matrix(data_all_train)
X_assess, yy = create_X_y_matrix(data_all_train)
if i==1:
X, yy = create_X_y_matrix(data_all_test)
X_assess, yy = create_X_y_matrix(data_all_test)
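# Multinomial logit probabilities with "no event" as the base outcome:
#   P(default) = exp(X*b_def) / (1 + exp(X*b_def) + exp(X*b_pre))
#   P(prepay)  = exp(X*b_pre) / (1 + exp(X*b_def) + exp(X*b_pre))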
X_assess['prob_def'] = np.exp(np.matmul(X_def,param_def))/(1+np.exp(np.matmul(X_def,param_def))+np.exp(np.matmul(X_pre,param_pre)))
X_assess['prob_pre'] = np.exp(np.matmul(X_pre,param_pre))/(1+np.exp(np.matmul(X_def,param_def))+np.exp(np.matmul(X_pre,param_pre)))
X_assess['prob_other'] = 1 - X_assess['prob_def'] - X_assess['prob_pre']
X_assess = X_assess[['prob_def','prob_pre','prob_other']]
if i ==0:
assess = data_all_train[['period','prepay','default','upb_prev']]
if i ==1:
assess = data_all_test[['period','prepay','default','upb_prev']]
assess = X_assess.merge(assess,left_index=True,right_index=True)
# weight by previous UPB to measure prepay/default amounts rather than counts
assess['expected_prepay'] = assess['prob_pre']*assess['upb_prev']
assess['expected_default'] = assess['prob_def']*assess['upb_prev']
assess['actual_prepay'] = assess['prepay']*assess['upb_prev']
assess['actual_default'] = assess['default']*assess['upb_prev']
grouped = assess.groupby('period')
# RATES
# actual prepay/default rates per period
prepays = grouped['prepay'].agg([np.mean])
defaults = grouped['default'].agg([np.mean])
# expected prepay/default rates per period
e_prepays = grouped['prob_pre'].agg([np.mean])
e_defaults = grouped['prob_def'].agg([np.mean])
ax[i,0].plot(prepays)
ax[i,0].plot(e_prepays)
ax[i,0].legend(['Actual','Expected'])
ax[i,2].plot(defaults)
ax[i,2].plot(e_defaults)
ax[i,2].legend(['Actual','Expected'])
# AMOUNTS
# actual prepay/default amounts per period
prepays = grouped['actual_prepay'].agg([np.sum])
defaults = grouped['actual_default'].agg([np.sum])
# expected prepay/default amounts per period
e_prepays = grouped['expected_prepay'].agg([np.sum])
e_defaults = grouped['expected_default'].agg([np.sum])
ax[i,1].plot(prepays)
ax[i,1].plot(e_prepays)
ax[i,1].legend(['Actual','Expected'])
ax[i,3].plot(defaults)
ax[i,3].plot(e_defaults)
ax[i,3].legend(['Actual','Expected'])
if i ==0:
ax[i,0].set_title('Actual v Expected Prepay Rate - Train')
ax[i,2].set_title('Actual v Expected Default Rate - Train')
ax[i,1].set_title('Actual v Expected Prepay Amount - Train')
ax[i,3].set_title('Actual v Expected Default Amount - Train')
if i ==1:
ax[i,0].set_title('Actual v Expected Prepay Rate - Test')
ax[i,2].set_title('Actual v Expected Default Rate - Test')
ax[i,1].set_title('Actual v Expected Prepay Amount - Test')
ax[i,3].set_title('Actual v Expected Default Amount - Test')
return assess, grouped
# Fit a model in two steps with a small regularization penalty of 25
reg_penalty=25
a,b,c,d,e,f,g,h,full_model = master_model_fit(10000,reg_penalty)
assess_model,grouped_model = graph_model(g)
model_output = pd.concat((full_model.params.T, full_model.tvalues.T,full_model.bse.T))
model_output.index = np.array(['Coeff Default','Coeff Prepay',
'T-Stat Default','T-Stat Prepay',
'SE Default','SE Prepay'])
model_output
model_output.to_csv('/content/drive/My Drive/final_model_output.csv')
Calculate the average absolute difference between the actual and expected default rates across periods.
per_period_means = grouped_model.agg([np.mean])
act_def = per_period_means[('default','mean')]
exp_def = per_period_means[('prob_def','mean')]
act_min_exp = abs(act_def - exp_def)
print('Average absolute difference between the actual and expected default rates')
print(act_min_exp.mean())
import matplotlib.patches as mpatches
fig, ax = plt.subplots(1, 2,figsize=(20, 5))
# Set ylims so they are roughly the same proportion
ax[0].hist(assess_model[assess_model['default']==1]['prob_def'],alpha=.5,color='red',bins=np.arange(0,.05,.001),density=False)
ax[0].grid(False)
ax[0].set_ylabel('Defaulting Mortgages')
ax[0].set_ylim(0,2196)
ax[0].set_xlabel('Default Probability Assigned By Model')
ax2 = ax[0].twinx()
ax2.set_ylim(0,687919)
ax2.hist(assess_model[assess_model['default']==0]['prob_def'],alpha=.5, color='green',bins=np.arange(0,.05,.001),density=False)
ax2.set_ylabel('Non-Defaulting Mortgages')
ax2.set_title('Test Set - Default Probabilities Assigned By Model')
ax2.grid(False)
red_patch = mpatches.Patch(color='red', label='Defaulted Mortgages - Month of Default',alpha=.5)
green_patch = mpatches.Patch(color='green', label='Non-Defaulting Mortgages',alpha=.5)
ax[0].legend(handles=[red_patch,green_patch])
ax[1].hist(assess_model[assess_model['prepay']==1]['prob_pre'],alpha=.5,color='red',bins=np.arange(0,.15,.003),density=False)
ax[1].grid(False)
ax[1].set_ylim(0,7278)
ax[1].set_ylabel('Prepaying Mortgages')
ax[1].set_xlabel('Prepay Probability Assigned By Model')
ax3 = ax[1].twinx()
ax3.set_ylim(0,175000)
ax3.hist(assess_model[assess_model['prepay']==0]['prob_pre'],alpha=.5, color='green',bins=np.arange(0,.15,.003),density=False)
ax3.set_ylabel('Non-Prepaying Mortgages')
ax3.set_title('Test Set - Prepay Probabilities Assigned By Model')
ax3.grid(False)
red_patch = mpatches.Patch(color='red', label='Prepaid Mortgages - Month of Prepay',alpha=.5)
green_patch = mpatches.Patch(color='green', label='Non-Prepaying Mortgages',alpha=.5)
ax[1].legend(handles=[red_patch,green_patch])
fig.subplots_adjust(wspace=.3)
plt.show()
print(assess_model[assess_model['prepay']==1]['prob_pre'].shape)
print(assess_model[assess_model['prepay']==0]['prob_pre'].shape)
from matplotlib.lines import Line2D
fig, ax = plt.subplots(3, 4,figsize=(20, 8))
fig.tight_layout(rect=[0, 0.03, 1, 0.95])
potential_years = ['00','01','02','03','04','05','06','07','08','09','10','11']
for i in range(12):
row = i//4
col = i%4
indices_year = data_all_test[data_all_test['loan seq num'].astype(str).str[2:4]==potential_years[i]].index
assess_year = assess_model.loc[indices_year]
num_loans_year = data_all_test.loc[assess_year.index]['loan seq num'].unique().shape[0]
grouped_year = assess_year.groupby('period')
cumsum_def_year = grouped_year[['default','prob_def']].agg([np.sum])/num_loans_year
cumsum_def_year = cumsum_def_year.cumsum()
ax[row,col].set_xlim(datetime.date(2000, 1, 1), datetime.date(2018, 12, 5))
ax[row,col].set_ylim(0,.2)
ax[row,col].plot(cumsum_def_year['default'])
ax[row,col].plot(cumsum_def_year['prob_def'])
ax[row,col].set_title('Originated in 20'+potential_years[i])
legend_lines = [Line2D([0], [0], color='C0', lw=4),
Line2D([0], [0], color='C1', lw=4)]
plt.figlegend(legend_lines, ('Actual', 'Expected'), loc='lower center', ncol=2, fontsize='medium')
plt.show()
Some extra code showing how to fit a custom model with hand-specified alpha penalties.
def custom_model(custom_alpha_penalty,alpha_small_penalty,alpha_large_penalty):
# transform training variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('train',0,0)
# fit model
mdl_fit = run_MN_regression(X_MN,y_MN,custom_alpha_penalty)
# train/test log_likelihoods
train_avg_ll = mdl_fit.llf/X_MN.shape[0]
# transform testing variables
X_MN,y_MN,mn_vars,alpha_penalty = setup_MN_variables('test',alpha_large_penalty,alpha_small_penalty)
# test log likelihood
test_avg_ll = calc_avg_log_like(mdl_fit,y_MN,X_MN)
return train_avg_ll, test_avg_ll, mdl_fit.params, mdl_fit.tvalues, mdl_fit
large_penalty = 10000
small_penalty = 25
# David's initial regularization: heavily penalize (effectively remove) variables that are not intuitively useful
custom_alpha_penalty = np.concatenate((np.array([0]),
np.array([small_penalty,small_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,small_penalty,large_penalty,small_penalty,large_penalty,
large_penalty,large_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,small_penalty,small_penalty,small_penalty,small_penalty,
large_penalty,large_penalty,small_penalty,small_penalty,small_penalty,
small_penalty,small_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,large_penalty,large_penalty,large_penalty]))) # penalty for default variables
custom_alpha_penalty = np.concatenate((custom_alpha_penalty,np.array([0]),
np.array([small_penalty,small_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,small_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,large_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,small_penalty,small_penalty,small_penalty,small_penalty,
small_penalty,large_penalty,small_penalty,small_penalty,small_penalty,
small_penalty,small_penalty,small_penalty,small_penalty,large_penalty,
large_penalty,large_penalty,large_penalty,large_penalty]))) # penalty for prepay variables
trainll,testll, params, ts, mdlfit = custom_model(custom_alpha_penalty,small_penalty,large_penalty)
print(trainll,testll)
g = graph_model(params)