The goal is to use the given data to build a classification algorithm that accurately predicts whether a device will fail.
In this notebook, I will prepare the data for traditional regression-based algorithms. I will also use undersampling to address the heavy imbalance between failed and non-failed devices.
import numpy as np
from sklearn.model_selection import train_test_split as tts  # sklearn.cross_validation was removed in newer versions
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from datetime import datetime
%matplotlib inline
df = pd.read_csv('failures.csv')
df.head()
df.info()
df.corr()
corr = df.corr()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, ax=ax)
df.describe()
df.nunique()
# Columns with only a few unique values could be categorical data
fig = plt.figure(figsize=(7,10))
ax1 = plt.subplot(331)
ax2 = plt.subplot(332)
ax3 = plt.subplot(333)
ax4 = plt.subplot(334)
ax5 = plt.subplot(335)
ax6 = plt.subplot(336)
ax7 = plt.subplot(337)
ax8 = plt.subplot(338)
ax9 = plt.subplot(339)
df.boxplot(column='attribute1',by='failure',ax=ax1)
df.boxplot(column='attribute2',by='failure',ax=ax2)
df.boxplot(column='attribute3',by='failure',ax=ax3)
df.boxplot(column='attribute4',by='failure',ax=ax4)
df.boxplot(column='attribute5',by='failure',ax=ax5)
df.boxplot(column='attribute6',by='failure',ax=ax6)
df.boxplot(column='attribute7',by='failure',ax=ax7)
df.boxplot(column='attribute8',by='failure',ax=ax8)
df.boxplot(column='attribute9',by='failure',ax=ax9)
plt.suptitle('')
plt.tight_layout()
df.attribute1.value_counts()
# Continuous variables can be problematic for tree-based algorithms: they can create pure nodes too quickly, encouraging overfitting
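# As a hedged illustration of one common workaround (not used downstream), a continuous
# attribute can be discretized into quantile bins first; the bin count of 10 is an arbitrary choice
binned_attr1 = pd.qcut(df.attribute1, q=10, duplicates='drop')
print(binned_attr1.value_counts())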
# Removing Duplicates
print(df.duplicated().sum())  # No duplicated rows
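# Quick sanity check before dropping attribute8: confirm it is element-wise identical to attribute7 (a minimal sketch)
print((df.attribute7 == df.attribute8).all())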
df.drop('attribute8', axis=1, inplace=True)  # attribute8 is a duplicate of attribute7
df.head()
# We have an imbalanced data set
numfailure0 = len(df[df.failure == 0])
numfailure1 = len(df[df.failure == 1])
total = numfailure0 + numfailure1
print(numfailure0, numfailure1, total)
df.failure.value_counts(normalize=True)
print(df.date.nunique(), df.device.nunique())
print(df.date.nunique() * df.device.nunique() / float(total))
df.date.value_counts()
# There does not appear to be an obvious pattern between the dates and the failures
df[df.failure==1].head(20)
# It appears measurements are taken until a device fails, since the last recorded measurement for a failing device is the failure itself
print(df[df.failure == 1].device.duplicated().sum())
print(df[df.device == 'S1F0P3G2'])
print(df[df.device == 'W1F0P114'])
# We must assume all devices are created equal; thus some combination of the attributes, and possibly time-based features, drives failure
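# As a rough sketch of the time-to-failure idea (assuming roughly one row per device per day),
# the number of records per device approximates how long each device was observed
records_per_device = df.groupby('device').size()
print(records_per_device.describe())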
# Creating a column to measure elapsed time, called 'daysActive'
df.date = pd.to_datetime(df.date)
df['daysActive'] = df.date - df.date[0]  # elapsed time since the first recorded date
df.daysActive = df.daysActive.dt.days  # convert the timedelta to whole days
df.head()
# Creating a feature called 'season'
# Using Northern Hemisphere seasonality
season = []
for date in df.date:
    if date < datetime(2015, 3, 1):
        season.append('winter')
    elif date < datetime(2015, 6, 1):
        season.append('spring')
    elif date < datetime(2015, 9, 1):
        season.append('summer')
    else:
        season.append('fall')
df['season'] = season
df.head()
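# A vectorized equivalent of the season loop (a sketch; like the loop, it assumes all dates fall in 2015)
season_alt = pd.cut(df.date.dt.month, bins=[0, 2, 5, 8, 12],
                    labels=['winter', 'spring', 'summer', 'fall'])
print((season_alt.astype(str) == df.season).all())  # expect True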
df.drop(['date','device'],axis=1,inplace=True)
df.info()
# Normalize Attributes 1 & 6
attribute1max = float(df.attribute1.max())
attribute1min = float(df.attribute1.min())
attribute6max = float(df.attribute6.max())
attribute6min = float(df.attribute6.min())
df.attribute1 = (df.attribute1 - attribute1min) / (attribute1max - attribute1min)
df.attribute6 = (df.attribute6 - attribute6min) / (attribute6max - attribute6min)
print('Skew for {a} is: {b}'.format(a='attribute1', b=df.attribute1.skew()))
print('Skew for {a} is: {b}'.format(a='attribute6', b=df.attribute6.skew()))
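# The same min-max scaling is available in scikit-learn; since the columns were already scaled above,
# re-fitting here is a no-op that just illustrates the equivalence (a sketch)
from sklearn.preprocessing import MinMaxScaler
rescaled = MinMaxScaler().fit_transform(df[['attribute1', 'attribute6']])
print(np.allclose(rescaled, df[['attribute1', 'attribute6']].values))  # expect True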
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(121)
ax2 = plt.subplot(122)
df.hist(column='attribute1',ax=ax1)
df.hist(column='attribute6',ax=ax2)
df.info()
df = pd.get_dummies(df, columns=['season', 'attribute2', 'attribute3', 'attribute4',
                                 'attribute5', 'attribute7', 'attribute9'], drop_first=True)
df.head()
X = df.drop('failure', axis=1, inplace=False)
y = df.failure
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=123)  # 'ratio' was renamed 'sampling_strategy' in newer imblearn
X, y = rus.fit_resample(X, y)  # fit_sample was renamed fit_resample
X, y = np.asarray(X), np.asarray(y)  # ensure positional indexing works in the KFold loops below
print(X.shape, y.shape)
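# SMOTE was imported above; as an alternative sketch (not used downstream), the minority
# class could be oversampled instead of undersampling the majority
X_sm, y_sm = SMOTE(random_state=123).fit_resample(df.drop('failure', axis=1), df.failure)
print(X_sm.shape, y_sm.shape)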
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8,random_state=123, stratify=y) # Stratify Data to Keep Test Balance
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
from sklearn.linear_model import LogisticRegression as LR
lr = LR(random_state=123).fit(X_train, y_train)
pred_lr = lr.predict(X_test)
accScore_lr = accuracy_score(y_test, pred_lr)
precisionScore_lr = precision_score(y_test, pred_lr)
recallScore_lr = recall_score(y_test, pred_lr)
print('Under-Sampling Logistic Regression: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'.format(a=accScore_lr, b=precisionScore_lr, c=recallScore_lr))
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)  # shuffle: the undersampler stacks the classes, so unshuffled folds can be single-class
accuracy = []
precision = []
recall = []
model = LR()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Logistic Regression - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
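# The same fold-averaged metrics, computed more compactly with cross_validate (a sketch reusing the same shuffled folds)
from sklearn.model_selection import cross_validate
cv_results = cross_validate(LR(), X, y, cv=kf, scoring=['accuracy', 'precision', 'recall'])
print(cv_results['test_accuracy'].mean(), cv_results['test_precision'].mean(), cv_results['test_recall'].mean())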
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    # This function prints and plots the confusion matrix.
    # Normalization can be applied by setting normalize=True.
    # Note: normalization must happen before imshow, or the heatmap shows raw counts while the annotations show ratios.
    if normalize:
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized Confusion Matrix')
    else:
        print('Confusion Matrix without Normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f') if normalize else cm[i, j],
                 horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
cnf_matrix = confusion_matrix(y_test, pred_lr)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Logistic Regression - Normalized Confusion Matrix')
plt.show()
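# Newer scikit-learn (>= 1.0) ships an equivalent built-in helper; a one-line sketch,
# left commented so the notebook still runs on older versions:
# from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_predictions(y_test, pred_lr, display_labels=class_names, normalize='true')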
from sklearn.metrics import precision_recall_curve, average_precision_score
probs_lr = lr.predict_proba(X_test)[:, 1]  # use probabilities, not hard labels, for a meaningful curve
precision, recall, _ = precision_recall_curve(y_test, probs_lr)
ap_lr = average_precision_score(y_test, probs_lr)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Logistic Regression - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_lr))
from sklearn import metrics
predsLR = lr.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsLR)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Logistic Regression - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
from sklearn.linear_model import RidgeClassifier
ridge = RidgeClassifier(random_state=123).fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)
accScore_ridge = accuracy_score(y_test, pred_ridge)
precisionScore_ridge = precision_score(y_test, pred_ridge)
recallScore_ridge = recall_score(y_test, pred_ridge)
print('Under-Sampling Ridge Classifier: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_ridge, b=precisionScore_ridge, c=recallScore_ridge))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = RidgeClassifier()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Ridge Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_ridge)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Ridge Classifier - Normalized Confusion Matrix')
plt.show()
from sklearn.metrics import precision_recall_curve
scores_ridge = ridge.decision_function(X_test)  # RidgeClassifier exposes decision scores rather than probabilities
precision, recall, _ = precision_recall_curve(y_test, scores_ridge)
ap_ridge = average_precision_score(y_test, scores_ridge)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Ridge Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_ridge))
# RidgeClassifier has no predict_proba, so the block below would raise an AttributeError; kept for reference
'''from sklearn import metrics
predsRidge = ridge.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsRidge)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
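# A working ROC alternative for the ridge model (a sketch): RidgeClassifier exposes decision_function,
# and roc_curve accepts raw decision scores in place of probabilities
fpr, tpr, _ = metrics.roc_curve(y_test, ridge.decision_function(X_test))
roc_auc = metrics.auc(fpr, tpr)
plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()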
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=123).fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)
accScore_sgd = accuracy_score(y_test, pred_sgd)
precisionScore_sgd = precision_score(y_test, pred_sgd)
recallScore_sgd = recall_score(y_test, pred_sgd)
print('Under-Sampling Stochastic Gradient Descent: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_sgd, b=precisionScore_sgd, c=recallScore_sgd))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = SGDClassifier()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Stochastic Gradient Descent - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_sgd)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Stochastic Gradient Descent - Normalized Confusion Matrix')
plt.show()
scores_sgd = sgd.decision_function(X_test)  # the default hinge-loss SGDClassifier has no predict_proba; use decision scores
precision, recall, _ = precision_recall_curve(y_test, scores_sgd)
ap_sgd = average_precision_score(y_test, scores_sgd)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Stochastic Gradient Descent - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_sgd))
# The default SGDClassifier (hinge loss) has no predict_proba, so the block below is disabled; kept for reference
'''predsSGD = sgd.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsSGD)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
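# Likewise for SGD (a sketch): score the ROC from decision_function, or refit with a probabilistic
# loss ('log_loss' in recent scikit-learn, 'log' in older releases) to unlock predict_proba
fpr, tpr, _ = metrics.roc_curve(y_test, sgd.decision_function(X_test))
print('SGD AUC: %0.2f' % metrics.auc(fpr, tpr))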
from sklearn.svm import SVC
svc = SVC(random_state=123, probability=True).fit(X_train, y_train)
pred_svc = svc.predict(X_test)
accScore_svc = accuracy_score(y_test, pred_svc)
precisionScore_svc = precision_score(y_test, pred_svc)
recallScore_svc = recall_score(y_test, pred_svc)
print('Under-Sampling Support Vector Machine: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_svc, b=precisionScore_svc, c=recallScore_svc))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = SVC()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Support Vector Machine Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_svc)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Support Vector Machine Classifier - Normalized Confusion Matrix')
plt.show()
probs_svc = svc.predict_proba(X_test)[:, 1]  # probability=True was set when fitting the SVC
precision, recall, _ = precision_recall_curve(y_test, probs_svc)
ap_svc = average_precision_score(y_test, probs_svc)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Support Vector Machine Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_svc))
predsSVC = svc.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsSVC)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Support Vector Machine Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
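# To wrap up, collect the hold-out scores computed above into one table for easier comparison (a sketch)
summary = pd.DataFrame({
    'accuracy': [accScore_lr, accScore_ridge, accScore_sgd, accScore_svc],
    'precision': [precisionScore_lr, precisionScore_ridge, precisionScore_sgd, precisionScore_svc],
    'recall': [recallScore_lr, recallScore_ridge, recallScore_sgd, recallScore_svc],
}, index=['LogisticRegression', 'RidgeClassifier', 'SGDClassifier', 'SVC'])
print(summary)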