Mortgage Default Demo

View the training data

Exploratory data analysis

Correlation chart

View the predicted data

import asyncio #import pyodide import numpy as np import pandas as pd import panel as pn import matplotlib.pyplot as plt import seaborn as sns from pyodide.http import open_url from panel.io.pyodide import show print('Downloading the training and test data . . .') data_train = pd.read_csv(open_url('/mtgdefault.csv')) #data_train.sample(3) df = pd.DataFrame(data_train) print('Downloading the validation data . . .') data_test = pd.read_csv(open_url('/mtgdefaulttest.csv')) print('View the training and test data') #

source_table = pn.widgets.Tabulator(pagination='local', page_size=10) #source_table = pn.widgets.Tabulator(pagination='remote', layout='fit_data_stretch', width=330, page_size=10) source_table.value = data_train #new filename, button = source_table.download_menu( text_kwargs={'name': 'Enter filename', 'value': 'default.csv'}, button_kwargs={'name': 'Download table'} ) source_table = pn.Row( pn.Column(filename, button), source_table ) #new end document.getElementById('source_table').style.display = 'block' await show(source_table, 'source_table') print('View the correlation matrix') import matplotlib.pyplot as plt import numpy as np x = np.random.randn(100) y = np.random.randn(100) fig, ax = plt.subplots() ax.scatter(x, y) fig await show(fig, 'fig') from sklearn.model_selection import train_test_split X_all = data_train.drop(['outcome', 'ID'], axis=1) y_all = data_train['outcome'] num_test = 0.20 X_train, X_test, y_train, y_test = train_test_split(X_all, y_all, test_size=num_test, random_state=23) print('Split of training and test data complete . . .') from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import make_scorer, accuracy_score from sklearn.model_selection import GridSearchCV # Choose the type of classifier. clf = RandomForestClassifier() #old parameters, very accurate # Choose some parameter combinations to try #parameters = {'n_estimators': [4, 6, 9], # 'max_features': ['log2', 'sqrt','auto'], # 'criterion': ['entropy', 'gini'], # 'max_depth': [2, 3, 5, 10], # 'min_samples_split': [2, 3, 5], # 'min_samples_leaf': [1,5,8] # } # Choose some parameter combinations to try parameters = {'n_estimators': [2], 'max_features': ['auto'], 'criterion': ['entropy'], 'max_depth': [1], 'min_samples_split': [2], 'min_samples_leaf': [5] } # Type of scoring used to compare parameter combinations acc_scorer = make_scorer(accuracy_score) print('Running the model on training data . . .') # Run the grid search grid_obj = GridSearchCV(clf, parameters, scoring=acc_scorer) grid_obj = grid_obj.fit(X_train, y_train) # Set the clf to the best combination of parameters clf = grid_obj.best_estimator_ # Fit the best algorithm to the data. clf.fit(X_train, y_train) predictions = clf.predict(X_test) print(accuracy_score(y_test, predictions)) print('Making predictions on new data that does not have mortgage default status yet') ids = data_test['ID'] cred = data_test['Credit_score'] predictions = clf.predict(data_test.drop('ID', axis=1)) output = pd.DataFrame({ 'ID' : ids, 'Credit_score' : cred, 'outcome': predictions }) #outputcsv = output.to_csv('predictions.csv', index = False) #print(output.head()) table = pn.widgets.Tabulator(pagination='remote', page_size=10) table.value = pd.DataFrame({ 'ID' : ids, 'Credit_score' : cred, 'outcome': predictions }) document.getElementById('table').style.display = 'block' await show(table, 'table') #await show(pred, 'pred')