Goal: use the given data to build a classification algorithm that accurately predicts whether a device will fail.
In this notebook, I manipulate the data for tree-based algorithms. I also use over-sampling due to the imbalance of device failures.
import numpy as np
from sklearn.cross_validation import train_test_split as tts
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from datetime import datetime
%matplotlib inline
# Load the data set from 'failures.csv' (path is relative to the working directory).
df = pd.read_csv('failures.csv')
# First rows, column dtypes / non-null counts, and pairwise correlations.
df.head()
df.info()
df.corr()
# Recompute the correlation matrix, keep it in `corr`, and draw it as a heatmap
# with the column names as tick labels on both axes.
corr = df.corr()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, ax=ax)
# Summary statistics and the number of distinct values per column.
df.describe()
df.nunique()
# NOTE: columns with few distinct values could be treated as categorical data
# Box plot of attribute1..attribute9, each grouped by the 'failure' column,
# laid out on a 3x3 grid.
fig = plt.figure(figsize=(7, 10))
# Create all nine axes before drawing so the call order matches the original
# cell (grid first, plots second).
axes = [plt.subplot(3, 3, position) for position in range(1, 10)]
for position, axis in enumerate(axes, start=1):
    df.boxplot(column='attribute{}'.format(position), by='failure', ax=axis)
# Blank out the figure-level title (presumably the one the grouped box plot
# adds) and let matplotlib resolve overlapping labels.
plt.suptitle('')
plt.tight_layout()
# Frequency of each distinct attribute1 value.
df.attribute1.value_counts()
# Continuous variables create problems for tree-based algorithms: they create pure tables too quickly.
# Removing duplicates
print df.duplicated().sum() # No duplicated rows
df.drop('attribute8',axis=1,inplace=True) # attribute8 is a duplicate of attribute7; dropped in place
df.head()
# The data set is imbalanced: count the rows in each failure class.
failure_flags = df.failure
numfailure0 = int((failure_flags == 0).sum())
numfailure1 = int((failure_flags == 1).sum())
total = numfailure0 + numfailure1
print('{} {} {}'.format(numfailure0, numfailure1, total))
df.failure.value_counts(normalize=True)
# Number of distinct dates and devices, and how many (date, device)
# combinations exist per row actually present.
n_dates = df.date.nunique()
n_devices = df.device.nunique()
print('{} {}'.format(n_dates, n_devices))
print(n_dates * n_devices / float(total))
df.date.value_counts()
# No obvious pattern is apparent here (the original note was left unfinished).
failed_rows = df[df.failure == 1]
failed_rows.head(20)
# It appears measurements are taken until the device fails, since the last measurement is the failure
print(failed_rows.device.duplicated().sum())
# Full history of two individual devices.
for device_id in ('S1F0P3G2', 'W1F0P114'):
    print(df[df.device == device_id])
# We must assume all devices are created equal, so failure is modelled from a
# combination of the attributes and possibly a time-based attribute.
# 'daysActive' = whole days between each row's date and the date of the row labelled 0.
df['date'] = pd.to_datetime(df['date'])
reference_date = df['date'][0]
df['daysActive'] = (df['date'] - reference_date).dt.days
df.head()
# Creating a feature called 'season', using Northern Hemisphere seasonality.
# The season is derived from the calendar month instead of fixed 2015 cut-off
# dates, so December maps to 'winter' and dates in other years are labelled
# correctly. For dates from January through November 2015 the labels are the
# same as with the fixed cut-offs.
month_to_season = {
    12: 'winter', 1: 'winter', 2: 'winter',
    3: 'spring', 4: 'spring', 5: 'spring',
    6: 'summer', 7: 'summer', 8: 'summer',
    9: 'fall', 10: 'fall', 11: 'fall',
}
# Requires df.date to already hold datetimes (the .dt accessor fails on strings).
season = df.date.dt.month.map(month_to_season)
df['season'] = season
df.head()
# Drop the raw 'date' and 'device' columns in place and re-check the frame.
df.drop(['date', 'device'], axis=1, inplace=True)
df.info()
# Report the skew of attribute1 and attribute6, then plot their histograms
# side by side.
for column_name in ('attribute1', 'attribute6'):
    print('Skew for {a} is: {b}'.format(a=column_name, b=df[column_name].skew()))
fig = plt.figure(figsize=(10, 10))
ax1 = plt.subplot(1, 2, 1)
ax2 = plt.subplot(1, 2, 2)
df.hist(column='attribute1', ax=ax1)
df.hist(column='attribute6', ax=ax2)
# Replace three continuous columns with the index of their 20-quantile bin
# (labels=False makes qcut return integer bin indicators).
binned_sources = ['attribute1', 'attribute6', 'daysActive']
for source in binned_sources:
    df[source + 'Binned'] = pd.qcut(df[source], 20, labels=False)
df.attribute6Binned
# The un-binned originals are removed once their binned versions exist.
df.drop(binned_sources, axis=1, inplace=True)
df.info()
# One-hot encode 'season', dropping the first level.
df = pd.get_dummies(df, columns=['season'], drop_first=True)
df.head()
# Separate the predictors from the 'failure' target column.
X = df.drop('failure', axis=1, inplace=False)
y = df.failure
print('{} {}'.format(X.shape, y.shape))
# Import train_test_split from sklearn.model_selection, the module this
# notebook already uses for KFold, rather than the deprecated
# sklearn.cross_validation.
from sklearn.model_selection import train_test_split as tts
# 80/20 split, stratified on y so both parts keep the same class balance.
X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8, random_state=123, stratify=y)
print('{} {} {} {}'.format(X_train.shape, X_test.shape, y_train.shape, y_test.shape))
# Resample the training split only (X_test / y_test are not touched) with SMOTE.
# NOTE(review): ratio=1.0 presumably requests equal class counts after
# resampling -- confirm against the installed imblearn version.
sm = SMOTE(random_state=123, ratio=1.0)
X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)
print X_train_smote.shape, y_train_smote.shape
# Second resampled training set, built with SMOTEENN from the same training
# split; `sm` is rebound to the new sampler.
sm = SMOTEENN(random_state=123, ratio=1.0)
X_train_smoteenn, y_train_smoteenn = sm.fit_sample(X_train, y_train)
print X_train_smoteenn.shape, y_train_smoteenn.shape
# Random forest fitted on the SMOTE-resampled training data and scored on the
# held-out test split.
from sklearn.ensemble import RandomForestClassifier as RFC
rf_smote = RFC(n_estimators=100, random_state=123)
rf_smote.fit(X_train_smote, y_train_smote)
pred_rf_smote = rf_smote.predict(X_test)
accScore_rf_smote = accuracy_score(y_test, pred_rf_smote)
precisionScore_rf_smote = precision_score(y_test, pred_rf_smote)
recallScore_rf_smote = recall_score(y_test, pred_rf_smote)
rf_smote_report = 'Over-Sampling (SMOTE) Random Forests: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'
print(rf_smote_report.format(a=accScore_rf_smote, b=precisionScore_rf_smote, c=recallScore_rf_smote))
# 10-fold cross-validation of a random forest on the SMOTE-resampled training data.
from sklearn.model_selection import KFold
kf = KFold(n_splits=10)
accuracy = []
precision = []
recall = []
model = RFC(n_estimators=100)
# Fold indices are generated from the array that is indexed below. Splitting on
# X (the full, un-resampled frame) produces indices for a data set of a
# different length, so rows are skipped or the indices run out of range.
# NOTE(review): the folds are unshuffled and drawn from already over-sampled
# data, so these scores are likely optimistic -- consider shuffling and
# resampling inside each fold.
for train_index, test_index in kf.split(X_train_smote):
    print('TRAIN: {} TEST: {} \n'.format(train_index, test_index))
    Xtrain, Xtest = X_train_smote[train_index], X_train_smote[test_index]
    ytrain, ytest = y_train_smote[train_index], y_train_smote[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Over-Sampling (SMOTE) Random Forests - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X_train_smote), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """Print a confusion matrix and draw it on the current matplotlib figure.

    Parameters
    ----------
    cm : 2-D array-like
        Confusion matrix; rows are labelled 'True Label' and columns
        'Predicted Label' on the plot.
    classes : sequence
        Tick labels, one per row/column of ``cm``.
    normalize : bool
        When true, every row is divided by its row sum before printing
        and plotting.
    title : str
        Plot title.
    cmap : matplotlib colormap
        Colormap used for the image.

    Returns
    -------
    None. The matrix is printed and drawn with pyplot; the caller is
    responsible for ``plt.figure()`` / ``plt.show()``.
    """
    cm = np.asarray(cm)
    # Normalize BEFORE drawing so the image colours, the colour bar and the
    # cell text all describe the same numbers.
    if normalize:
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized Confusion Matrix')
    else:
        print('Confusion Matrix without Normalization')
    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    # Two decimals for fractional matrices, plain integers for raw counts.
    cell_format = '.2f' if cm.dtype.kind == 'f' else 'd'
    # Float division: on Python 2 an integer matrix would otherwise floor here.
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        # White text on dark cells, black text on light cells.
        plt.text(j, i, format(cm[i, j], cell_format), horizontalalignment='center',
                 color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
# --- Normalized confusion matrix for the SMOTE random forest on the test split ---
cnf_matrix = confusion_matrix(y_test, pred_rf_smote)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Random Forests (SMOTE) - Normalized Confusion Matrix')
plt.show()

# --- Precision-recall curve ---
# The curve is built from the predicted probability of class 1 rather than
# hard 0/1 predictions, which only give a single operating point. The "AP"
# in the title is the average precision, not the precision score at the
# default threshold.
from sklearn.metrics import precision_recall_curve, average_precision_score
predsRF_smote = rf_smote.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, predsRF_smote)
avgPrecision_rf_smote = average_precision_score(y_test, predsRF_smote)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Random Forests (SMOTE) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(avgPrecision_rf_smote))
# Finish this figure so the ROC curve below is not drawn onto the same axes
# when the code runs outside per-cell inline rendering.
plt.show()

# --- ROC curve from the same class-1 probabilities ---
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(y_test, predsRF_smote)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Random Forests (SMOTE) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Random forest fitted on the SMOTEENN-resampled training data and scored on
# the held-out test split.
rf_smoteenn = RFC(random_state=123, n_estimators=100).fit(X_train_smoteenn, y_train_smoteenn)
# Predict with the SMOTEENN model itself; predicting with rf_smote here would
# just repeat the SMOTE model's scores under the SMOTEENN label.
pred_rf_smoteenn = rf_smoteenn.predict(X_test)
accScore_rf_smoteenn = accuracy_score(y_test, pred_rf_smoteenn)
precisionScore_rf_smoteenn = precision_score(y_test, pred_rf_smoteenn)
recallScore_rf_smoteenn = recall_score(y_test, pred_rf_smoteenn)
print('Over-Sampling (SMOTEENN) Random Forests: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'.format(a=accScore_rf_smoteenn, b=precisionScore_rf_smoteenn, c=recallScore_rf_smoteenn))
# 5-fold cross-validation of a random forest on the SMOTEENN-resampled training data.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
accuracy = []
precision = []
recall = []
model = RFC(n_estimators=100)
# Fold indices are generated from the array that is indexed below. Splitting on
# X (the full, un-resampled frame) produces indices for a data set of a
# different length, so rows are skipped or the indices run out of range.
# NOTE(review): the folds are unshuffled and drawn from already resampled
# data, so these scores are likely optimistic.
for train_index, test_index in kf.split(X_train_smoteenn):
    print('TRAIN: {} TEST: {} \n'.format(train_index, test_index))
    Xtrain, Xtest = X_train_smoteenn[train_index], X_train_smoteenn[test_index]
    ytrain, ytest = y_train_smoteenn[train_index], y_train_smoteenn[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Over-Sampling (SMOTEENN) Random Forests - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X_train_smoteenn), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
# --- Normalized confusion matrix for the SMOTEENN random forest on the test split ---
cnf_matrix = confusion_matrix(y_test, pred_rf_smoteenn)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
# The title names SMOTEENN, the sampler this section evaluates.
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Random Forests (SMOTEENN) - Normalized Confusion Matrix')
plt.show()

# --- Precision-recall curve ---
# The curve is built from the SMOTEENN model's predicted probability of
# class 1 rather than hard 0/1 predictions. The "AP" in the title is this
# model's average precision, not the SMOTE model's precision score.
from sklearn.metrics import precision_recall_curve, average_precision_score
predsRF_smoteenn = rf_smoteenn.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, predsRF_smoteenn)
avgPrecision_rf_smoteenn = average_precision_score(y_test, predsRF_smoteenn)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Random Forests (SMOTEENN) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(avgPrecision_rf_smoteenn))
# Finish this figure so the ROC curve below is not drawn onto the same axes
# when the code runs outside per-cell inline rendering.
plt.show()

# --- ROC curve from the same class-1 probabilities ---
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(y_test, predsRF_smoteenn)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Random Forests (SMOTEENN) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Gradient boosting fitted on the SMOTE-resampled training data and scored on
# the held-out test split.
from sklearn.ensemble import GradientBoostingClassifier as GBC
gbc_smote = GBC(n_estimators=100, random_state=123)
gbc_smote.fit(X_train_smote, y_train_smote)
pred_gbc_smote = gbc_smote.predict(X_test)
accScore_gbc_smote = accuracy_score(y_test, pred_gbc_smote)
precisionScore_gbc_smote = precision_score(y_test, pred_gbc_smote)
recallScore_gbc_smote = recall_score(y_test, pred_gbc_smote)
gbc_smote_report = 'Over-Sampling (SMOTE) Gradient Boosting: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'
print(gbc_smote_report.format(a=accScore_gbc_smote, b=precisionScore_gbc_smote, c=recallScore_gbc_smote))
# 5-fold cross-validation of gradient boosting on the SMOTE-resampled training data.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
accuracy = []
precision = []
recall = []
model = GBC(n_estimators=100)
# Fold indices are generated from the array that is indexed below. Splitting on
# X (the full, un-resampled frame) produces indices for a data set of a
# different length, so rows are skipped or the indices run out of range.
# NOTE(review): the folds are unshuffled and drawn from already over-sampled
# data, so these scores are likely optimistic.
for train_index, test_index in kf.split(X_train_smote):
    print('TRAIN: {} TEST: {} \n'.format(train_index, test_index))
    Xtrain, Xtest = X_train_smote[train_index], X_train_smote[test_index]
    ytrain, ytest = y_train_smote[train_index], y_train_smote[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Over-Sampling (SMOTE) Gradient Boosting - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X_train_smote), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
# --- Normalized confusion matrix for SMOTE gradient boosting on the test split ---
cnf_matrix = confusion_matrix(y_test, pred_gbc_smote)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Gradient Boosting (SMOTE) - Normalized Confusion Matrix')
plt.show()

# --- Precision-recall curve ---
# The curve is built from the predicted probability of class 1 rather than
# hard 0/1 predictions, which only give a single operating point. The "AP"
# in the title is the average precision, not the precision score at the
# default threshold.
from sklearn.metrics import precision_recall_curve, average_precision_score
predsGB_smote = gbc_smote.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, predsGB_smote)
avgPrecision_gbc_smote = average_precision_score(y_test, predsGB_smote)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Gradient Boosting (SMOTE) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(avgPrecision_gbc_smote))
# Finish this figure so the ROC curve below is not drawn onto the same axes
# when the code runs outside per-cell inline rendering.
plt.show()

# --- ROC curve from the same class-1 probabilities ---
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(y_test, predsGB_smote)
roc_auc = metrics.auc(fpr, tpr)
# The title names the sampler, matching the other ROC plots in this notebook.
plt.title('Gradient Boosting (SMOTE) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
# Gradient boosting fitted on the SMOTEENN-resampled training data and scored
# on the held-out test split.
gbc_smoteenn = GBC(n_estimators=100, random_state=123)
gbc_smoteenn.fit(X_train_smoteenn, y_train_smoteenn)
pred_gbc_smoteenn = gbc_smoteenn.predict(X_test)
accScore_gbc_smoteenn = accuracy_score(y_test, pred_gbc_smoteenn)
precisionScore_gbc_smoteenn = precision_score(y_test, pred_gbc_smoteenn)
recallScore_gbc_smoteenn = recall_score(y_test, pred_gbc_smoteenn)
gbc_smoteenn_report = 'Over-Sampling (SMOTEENN) Gradient Boosting: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'
print(gbc_smoteenn_report.format(a=accScore_gbc_smoteenn, b=precisionScore_gbc_smoteenn, c=recallScore_gbc_smoteenn))
# 5-fold cross-validation of gradient boosting on the SMOTEENN-resampled training data.
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
accuracy = []
precision = []
recall = []
model = GBC(n_estimators=100)
# Fold indices are generated from the array that is indexed below. Splitting on
# X (the full, un-resampled frame) produces indices for a data set of a
# different length, so rows are skipped or the indices run out of range.
# NOTE(review): the folds are unshuffled and drawn from already resampled
# data, so these scores are likely optimistic.
for train_index, test_index in kf.split(X_train_smoteenn):
    print('TRAIN: {} TEST: {} \n'.format(train_index, test_index))
    Xtrain, Xtest = X_train_smoteenn[train_index], X_train_smoteenn[test_index]
    ytrain, ytest = y_train_smoteenn[train_index], y_train_smoteenn[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Over-Sampling (SMOTEENN) Gradient Boosting - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X_train_smoteenn), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
# --- Normalized confusion matrix for SMOTEENN gradient boosting on the test split ---
cnf_matrix = confusion_matrix(y_test, pred_gbc_smoteenn)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Gradient Boosting (SMOTEENN) - Normalized Confusion Matrix')
plt.show()

# --- Precision-recall curve ---
# The curve is built from the predicted probability of class 1 rather than
# hard 0/1 predictions, which only give a single operating point. The "AP"
# in the title is the average precision, not the precision score at the
# default threshold.
from sklearn.metrics import precision_recall_curve, average_precision_score
predsGB_smoteenn = gbc_smoteenn.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, predsGB_smoteenn)
avgPrecision_gbc_smoteenn = average_precision_score(y_test, predsGB_smoteenn)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Gradient Boosting (SMOTEENN) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(avgPrecision_gbc_smoteenn))
# Finish this figure so the ROC curve below is not drawn onto the same axes
# when the code runs outside per-cell inline rendering.
plt.show()

# --- ROC curve from the same class-1 probabilities ---
from sklearn import metrics
fpr, tpr, _ = metrics.roc_curve(y_test, predsGB_smoteenn)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Gradient Boosting (SMOTEENN) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
# Diagonal reference line.
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()