Goal: use the given data to build a classification model that accurately predicts whether a person will default on their loan.
import numpy as np
from sklearn.cross_validation import train_test_split as tts
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import time
import seaborn as sns
# Read in Data
df = pd.read_csv('LoansTrainingSet.csv') #,dtype=str
df.head(10)
df.info()
df.isnull().sum()
for col in df.columns:
    print 'Number of Unique {} values:'.format(col), df[col].nunique()
# Creating a fake data set to explore drop duplicates
data = {'fruit':['apple','banana','carrot','apple','apple'],'dessert':['cake','cookie','ice cream','cake','brownie']}
dataFr = pd.DataFrame(data)
print dataFr, '\n' # Fruit DataFrame
print dataFr.duplicated().sum() # Number of Duplicates
dataFr.duplicated(subset='dessert') # Bools of Number of Duplicates
# Using drop_duplicates, if I specify multiple columns it only drops a row when the values are identical in all of the specified columns
#dataFr.drop_duplicates(subset=['fruit','dessert'],inplace=True)
dataFr.drop_duplicates(inplace=True)
#dataFr = dataFr.drop_duplicates()
print dataFr, '\n'
dataFr.info()
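# Illustrative sketch of how the subset argument changes which rows count as duplicates,
# on a rebuilt copy of the toy DataFrame (dataFr was modified in place above; demoFr is just for this example)
demoFr = pd.DataFrame({'fruit':['apple','banana','carrot','apple','apple'],
                       'dessert':['cake','cookie','ice cream','cake','brownie']})
print demoFr.drop_duplicates(subset='dessert'), '\n'        # duplicates judged on 'dessert' alone
print demoFr.drop_duplicates(subset=['fruit','dessert'])    # duplicates judged on both columns together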
# Drop Duplicates where values are equal on ALL features
df.drop_duplicates(inplace=True)
print 'Down to {} rows.'.format(len(df))
print 'There are', df['Loan ID'].nunique(), 'values.'
print (len(df)-df['Loan ID'].nunique()), 'more Rows to drop.'
# There are still some duplicated rows
x = df.duplicated(subset=['Loan ID'])
rowsToDrop = []
for i, isDup in enumerate(x):
    if isDup:
        rowsToDrop.append(i)
print len(rowsToDrop)
rowsToDrop
def removeDups(indexOfDups, DF):
    # This function loops through the positions of duplicated 'Loan ID' rows and, for each
    # group of duplicates, keeps the row with the fewest nulls/odd values and returns the
    # positions of the remaining rows so they can be dropped
    indexesToDrop = []
    skip = 0
    for i, value in enumerate(indexOfDups):
        if skip > 0:
            skip -= 1
            continue
        j = 1
        indexesToCompare = [value-1, value]
        nulls = []
        if (i+j) < (len(indexOfDups)-1):
            while (i+j) < len(indexOfDups) and indexOfDups[i+j] == value+j:
                indexesToCompare.append(value+j)
                j += 1
                skip += 1
        for index in indexesToCompare:
            nullCount = 0
            for feature in DF.iloc[index]:
                if (pd.isnull(feature)==True)|(feature=='n/a')|(feature=='NA')|(feature=='#VALUE!')|(feature==99999999)|(feature=='nan'):
                    nullCount += 1
            nulls.append(nullCount)
        leastNulls = np.argmin(nulls)
        del indexesToCompare[leastNulls]
        indexesToDrop.append(indexesToCompare)
        # print 'Still Working. Just Finished Looping on Index:', i
    return indexesToDrop
print 'Number of Rows to Drop:', len(rowsToDrop), '\n'
start_time = time.time()
rowsToDelete = removeDups(rowsToDrop,df)
rowsToDelete = [item for sublist in rowsToDelete for item in sublist]
print '\n','Run Time:', (time.time()-start_time), 'seconds'
print len(rowsToDelete)
df.drop(df.index[rowsToDelete],inplace=True,axis=0)
'''for value in rowsToDelete:
df.drop(df.index[value])
print 'Just finished dropping row',value'''
print len(df['Loan ID']), len(df['Years in current job'])
# Can skip all those lines of code with this (1) line of code using groupby
'''import time
start_time = time.time()
df = df.groupby(df['Loan ID']).max()
print 'Run Time:', (time.time()-start_time)'''
# Kept the longer loop-based code above because it was actually significantly faster than the groupby
# Run time: approx 360 seconds for the groupby vs approx 24 seconds for the loop
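# An alternative sketch (not run, kept as a string like the groupby version above): rank each row by its
# count of null/odd values, then keep the best row per Loan ID with sort_values + drop_duplicates.
# The 'oddCount' column name is introduced here only for illustration.
'''
oddMarkers = ['n/a', 'NA', '#VALUE!', 'nan', 99999999]
oddCount = df.isnull().sum(axis=1) + df.isin(oddMarkers).sum(axis=1)
df = (df.assign(oddCount=oddCount)
        .sort_values('oddCount')
        .drop_duplicates(subset='Loan ID', keep='first')
        .drop('oddCount', axis=1))
'''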
df.info()
print df.isnull().sum(), '\n'
print df.duplicated().sum()
# Dropping Loan ID & Customer ID
df.drop(['Loan ID','Customer ID'],axis=1,inplace=True)
for col in df.columns:
print col, df[col].unique(), '\n'
# Fill Current Loan Amount
# Replace 99999999 Values
print df['Current Loan Amount'].value_counts(), '\n'
x = df[df['Current Loan Amount'] != 99999999]
df['Current Loan Amount'] = df['Current Loan Amount'].replace({99999999:x['Current Loan Amount'].median()})
print df['Current Loan Amount'].value_counts()
# Assuming the credit scores > 1000 are typos with an extra zero appended
# Removing the trailing 0 from (4) digit credit scores
df['Credit Score'] = df['Credit Score'].apply(lambda x: x/10 if x>=1000 else x)
print df['Credit Score'].unique(), '\n'
# Fico Credit Scores range from 300 to 850. Checking to make sure the values fall within this range.
count = 0
for score in df['Credit Score']:
    if (score<300)|(score>850):
        print score
        count += 1
print count
df['Credit Score'].fillna(df['Credit Score'].median(), inplace=True)
# Fill 'n/a' in Years in current job
# Assuming 'n/a' means they do not currently have a job
print df['Years in current job'].value_counts(), '\n'
df['Years in current job'] = df['Years in current job'].replace({'n/a':'No Job'})
print df['Years in current job'].value_counts()
# Fill Annual Income
print df['Annual Income'].median()
df['Annual Income'].fillna(df['Annual Income'].median(), inplace=True)
# Purpose has (2) values of 'other' & 'Other'
# Convert 'Other' values to 'other'
df['Purpose'] = df['Purpose'].apply(lambda x: x.lower() if x=='Other' else x)
print df['Purpose'].unique()
# For 'Months since last delinquent', fill NULL values (borrowers who have never been delinquent) with max+1
# A 'Been Delinquent?' (Y/N) feature is created from this below
monthsSinceLastDelinquentMaxPlus1 = df['Months since last delinquent'].max()+1
df['Months since last delinquent'].fillna(monthsSinceLastDelinquentMaxPlus1, inplace=True)
print df['Months since last delinquent'].max()
# Maximum Open Credit has nulls encoded as '#VALUE!'
print len(df[df['Maximum Open Credit']=='#VALUE!']), '\n'
print df['Maximum Open Credit'].value_counts(), '\n'
df['Maximum Open Credit'].replace(to_replace='#VALUE!',value=0,inplace=True)
# Replace null Bankruptcy values with 0
df['Bankruptcies'].fillna(0,inplace=True)
# Replace null Tax Lien values with 0
df['Tax Liens'].fillna(0,inplace=True)
df.isnull().sum()
# Creating 'Has Job?' Feature
# 1 if have job, else 0
df['Has Job?'] = df['Years in current job'].apply(lambda x: 0 if x == 'No Job' else 1)
# Creating 'Been Delinquent?' Feature
# 1 if been delinquent, else 0
df['Been Delinquent?'] = df['Months since last delinquent'].apply(lambda x: 0 if x == monthsSinceLastDelinquentMaxPlus1 else 1)
# Creating 'Credit Problem?' Feature
# 1 if had Credit Problem, else 0
df['Credit Problem?'] = df['Number of Credit Problems'].apply(lambda x: 0 if x == 0 else 1)
# Creating 'Been Bankrupt?' Feature
# 1 if been Bankrupt, else 0
df['Been Bankrupt?'] = df['Bankruptcies'].apply(lambda x: 0 if x == 0 else 1)
# Creating 'Had Tax Lien?' Feature
# 1 if Had Tax Lien, else 0
df['Had Tax Lien?'] = df['Tax Liens'].apply(lambda x: 0 if x==0 else 1)
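# Equivalent sketch (not run): the last three yes/no flags all follow the same
# "0 stays 0, anything else becomes 1" pattern, so they could also be built in one loop
'''
for src, flag in [('Number of Credit Problems', 'Credit Problem?'),
                  ('Bankruptcies', 'Been Bankrupt?'),
                  ('Tax Liens', 'Had Tax Lien?')]:
    df[flag] = (df[src] != 0).astype(int)
'''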
df.info()
# For Monthly Debt, remove '$' and ',' characters, then convert the columns to numeric types
df['Monthly Debt'] = df['Monthly Debt'].str.replace(r'\$','')   # escape '$' so it is treated literally, not as a regex anchor
df['Monthly Debt'] = df['Monthly Debt'].str.replace(',','')
df['Monthly Debt'] = df['Monthly Debt'].astype(float)
df['Maximum Open Credit'] = df['Maximum Open Credit'].astype(int)
df.info()
corr = df.corr()
fig, ax = plt.subplots(figsize=(14,14))
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, ax=ax)
def plotStackedBars(xbars, ybars, df):
    # Builds a DataFrame of ybars value counts for each unique value of xbars,
    # ready to plot as stacked bars
    plot_df = pd.DataFrame()
    for value in df[xbars].unique():
        x = df[df[xbars]==value][ybars].value_counts()
        plot_df = plot_df.append(x)
    plot_df.index = df[xbars].unique()
    return plot_df
# Purpose
plotStackedBars('Purpose','Loan Status',df).plot(kind='bar', stacked=True, rot=0, figsize=(14,6), color = ['red', 'green'])
# Years in current job
plotStackedBars('Years in current job','Loan Status',df).plot(kind='bar', stacked=True, rot=0, figsize=(14,6), color = ['red', 'green'])
# Term
plotStackedBars('Term','Loan Status',df).plot(kind='bar', stacked=True, rot=0, figsize=(14,6), color = ['red', 'green'])
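# The same counts behind these stacked bars can also be produced with pd.crosstab;
# a sketch for the Term plot only (column order may differ from plotStackedBars, which orders by frequency)
pd.crosstab(df['Term'], df['Loan Status']).plot(kind='bar', stacked=True, rot=0, figsize=(14,6), color=['red','green'])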
fig = plt.figure(figsize=(10,6))
plt.hist([df[df['Loan Status'] == 'Charged Off']['Credit Score'], df[df['Loan Status'] == 'Fully Paid']['Credit Score']], stacked=True, color= ['red', 'green'], bins=30, label=['Charged Off', 'Fully Paid'])
plt.xlabel('Credit Score')
plt.ylabel('Number of Loans')
plt.legend()
plt.show()
fig = plt.figure(figsize=(10,6))
plt.hist([df[df['Loan Status'] == 'Charged Off']['Monthly Debt'], df[df['Loan Status'] == 'Fully Paid']['Monthly Debt']], stacked=True, bins=50, color= ['red', 'green'], label=['Charged Off', 'Fully Paid'])
plt.xlabel('Monthly Debt')
plt.ylabel('Number of Loans')
plt.legend()
plt.show()
df.info()
d = {'Fully Paid':1,'Charged Off':0}
df['Loan Status'] = df['Loan Status'].map(d)
print df['Loan Status'].value_counts()
df = pd.get_dummies(df, columns = ['Term', 'Years in current job', 'Home Ownership', 'Purpose'], drop_first=True)
df.head()
y = df['Loan Status']
X = df.drop(['Loan Status'], axis=1)
X_train, X_test, y_train,y_test = tts(X,y,train_size=0.8, random_state=17)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
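# Defaults ('Charged Off' -> 0) are typically the minority class, which is why F1 is
# reported alongside accuracy below; a quick check of the class balance in the training split:
print y_train.value_counts(normalize=True)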
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn import grid_search
logRegModel = LogisticRegression().fit(X_train, y_train)
predictionLR = logRegModel.predict(X_test)
accScoreLR = accuracy_score(y_test, predictionLR)
f1ScoreLR = f1_score(y_test, predictionLR)
print 'Logistic Regression Accuracy Score:', accScoreLR
print 'Logistic Regression F1 Score:', f1ScoreLR
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier as GBC
clfGB = GBC(n_estimators=10).fit(X_train, y_train)
predictionGB = clfGB.predict(X_test)
accScoreGB = accuracy_score(y_test, predictionGB)
f1ScoreGB = f1_score(y_test, predictionGB)
print 'Gradient Boosting Accuracy Score:', accScoreGB
print 'Gradient Boosting F1 Score:', f1ScoreGB
# Random Forest
from sklearn.ensemble import RandomForestClassifier as RFC
clfRF = RFC(n_estimators=10).fit(X_train, y_train)
predictionRF = clfRF.predict(X_test)
accScoreRF = accuracy_score(y_test, predictionRF)
f1ScoreRF = f1_score(y_test, predictionRF)
print 'Random Forest Accuracy Score:', accScoreRF
print 'Random Forest F1 Score:', f1ScoreRF
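# A sketch (not run) of how the grid_search module imported above could be used to tune
# the random forest; the parameter grid below is illustrative, not a setting from this notebook
'''
paramGrid = {'n_estimators': [10, 50, 100], 'max_depth': [None, 5, 10]}
gridRF = grid_search.GridSearchCV(RFC(), paramGrid, scoring='f1', cv=3)
gridRF.fit(X_train, y_train)
print 'Best Params:', gridRF.best_params_
print 'Best CV F1 Score:', gridRF.best_score_
'''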
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    # This function plots the confusion matrix and prints the underlying array
    # Normalization can be applied by setting normalize=True
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    if normalize:
        cm = cm.astype(float)/cm.sum(axis=1)[:, np.newaxis]
        print 'Normalized Confusion Matrix'
    else:
        print 'Confusion Matrix without Normalization'
    print cm
    thresh = cm.max()/2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i,j], horizontalalignment='center', color='white' if cm[i,j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
cnf_matrix = confusion_matrix(y_test, predictionLR)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Loan Default', 'Fully Paid']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Normalized Confusion Matrix')
plt.show()
from sklearn import metrics
predsLR = logRegModel.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsLR)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
from ggplot import *
predsLR = logRegModel.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsLR)
ROCdfLR = pd.DataFrame(dict(fpr=fpr, tpr=tpr))
ggplot(ROCdfLR, aes(x='fpr', y='tpr')) + geom_line() + geom_abline(linetype='dashed') + ggtitle("ROC Curve")
auc = metrics.auc(fpr, tpr)
ggplot(ROCdfLR, aes(x='fpr', y='tpr', ymin='0', ymax='tpr')) + geom_area(alpha=.2) + geom_line(y='tpr') + ggtitle("ROC Curve w/ AUC= " + str(auc))
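# For comparison, a short sketch repeating the ROC/AUC computation for all three fitted models
for name, model in [('Logistic Regression', logRegModel), ('Gradient Boosting', clfGB), ('Random Forest', clfRF)]:
    probs = model.predict_proba(X_test)[:,1]
    fpr_m, tpr_m, _ = metrics.roc_curve(y_test, probs)
    print name, 'AUC:', metrics.auc(fpr_m, tpr_m)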