The goal is to use the given data to build a classification algorithm that accurately predicts whether a device will fail.
In this notebook, I will prepare the data for traditional regression-based algorithms. I will also use undersampling to address the heavy imbalance between failed and non-failed devices.
import numpy as np
from sklearn.model_selection import train_test_split as tts  # sklearn.cross_validation was removed in newer versions
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from datetime import datetime
%matplotlib inline
df = pd.read_csv('failures.csv')
df.head()
df.info()
df.corr()
corr = df.corr()
fig, ax = plt.subplots(figsize=(10,10))
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values, ax=ax)
df.describe()
df.nunique()
# Columns with only a few unique values could be categorical data
fig = plt.figure(figsize=(7,10))
ax1 = plt.subplot(331)
ax2 = plt.subplot(332)
ax3 = plt.subplot(333)
ax4 = plt.subplot(334)
ax5 = plt.subplot(335)
ax6 = plt.subplot(336)
ax7 = plt.subplot(337)
ax8 = plt.subplot(338)
ax9 = plt.subplot(339)
df.boxplot(column='attribute1',by='failure',ax=ax1)
df.boxplot(column='attribute2',by='failure',ax=ax2)
df.boxplot(column='attribute3',by='failure',ax=ax3)
df.boxplot(column='attribute4',by='failure',ax=ax4)
df.boxplot(column='attribute5',by='failure',ax=ax5)
df.boxplot(column='attribute6',by='failure',ax=ax6)
df.boxplot(column='attribute7',by='failure',ax=ax7)
df.boxplot(column='attribute8',by='failure',ax=ax8)
df.boxplot(column='attribute9',by='failure',ax=ax9)
plt.suptitle('')
plt.tight_layout()
df.attribute1.value_counts()
# Continuous variables can be problematic for tree-based algorithms: they can create pure nodes too quickly, encouraging overfitting
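# As a hedged illustration of one common workaround (not used downstream), a continuous
# attribute can be discretized into quantile bins first; the bin count of 10 is an arbitrary choice
binned_attr1 = pd.qcut(df.attribute1, q=10, duplicates='drop')
print(binned_attr1.value_counts())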
# Removing Duplicates
print(df.duplicated().sum())  # No duplicated rows
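# Quick sanity check before dropping attribute8: confirm it is element-wise identical to attribute7 (a minimal sketch)
print((df.attribute7 == df.attribute8).all())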
df.drop('attribute8', axis=1, inplace=True)  # attribute8 is a duplicate of attribute7
df.head()
# We have an imbalanced data set
numfailure0 = len(df[df.failure == 0])
numfailure1 = len(df[df.failure == 1])
total = numfailure0 + numfailure1
print(numfailure0, numfailure1, total)
df.failure.value_counts(normalize=True)
print(df.date.nunique(), df.device.nunique())
print(df.date.nunique() * df.device.nunique() / float(total))
df.date.value_counts()
# There does not appear to be an obvious pattern between the dates and the failures
df[df.failure==1].head(20)
# It appears measurements are taken until a device fails, since the last recorded measurement for a failing device is the failure itself
print(df[df.failure == 1].device.duplicated().sum())
print(df[df.device == 'S1F0P3G2'])
print(df[df.device == 'W1F0P114'])
# We must assume all devices are created equal; thus some combination of the attributes, and possibly time-based features, drives failure
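# As a rough sketch of the time-to-failure idea (assuming roughly one row per device per day),
# the number of records per device approximates how long each device was observed
records_per_device = df.groupby('device').size()
print(records_per_device.describe())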
# Creating a column to measure elapsed time, called 'daysActive'
df.date = pd.to_datetime(df.date)
df['daysActive'] = df.date - df.date[0]  # elapsed time since the first recorded date
df.daysActive = df.daysActive.dt.days  # convert the timedelta to whole days
df.head()
# Creating a feature called 'season'
# Using Northern Hemisphere seasonality
season = []
for date in df.date:
    if date < datetime(2015, 3, 1):
        season.append('winter')
    elif date < datetime(2015, 6, 1):
        season.append('spring')
    elif date < datetime(2015, 9, 1):
        season.append('summer')
    else:
        season.append('fall')
df['season'] = season
df.head()
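# A vectorized equivalent of the season loop (a sketch; like the loop, it assumes all dates fall in 2015)
season_alt = pd.cut(df.date.dt.month, bins=[0, 2, 5, 8, 12],
                    labels=['winter', 'spring', 'summer', 'fall'])
print((season_alt.astype(str) == df.season).all())  # expect True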
df.drop(['date','device'],axis=1,inplace=True)
df.info()
# Normalize Attributes 1 & 6
attribute1max = float(df.attribute1.max())
attribute1min = float(df.attribute1.min())
attribute6max = float(df.attribute6.max())
attribute6min = float(df.attribute6.min())
df.attribute1 = (df.attribute1 - attribute1min) / (attribute1max - attribute1min)
df.attribute6 = (df.attribute6 - attribute6min) / (attribute6max - attribute6min)
print('Skew for {a} is: {b}'.format(a='attribute1', b=df.attribute1.skew()))
print('Skew for {a} is: {b}'.format(a='attribute6', b=df.attribute6.skew()))
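# The same min-max scaling is available in scikit-learn; since the columns were already scaled above,
# re-fitting here is a no-op that just illustrates the equivalence (a sketch)
from sklearn.preprocessing import MinMaxScaler
rescaled = MinMaxScaler().fit_transform(df[['attribute1', 'attribute6']])
print(np.allclose(rescaled, df[['attribute1', 'attribute6']].values))  # expect True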
fig = plt.figure(figsize=(10,10))
ax1 = plt.subplot(121)
ax2 = plt.subplot(122)
df.hist(column='attribute1',ax=ax1)
df.hist(column='attribute6',ax=ax2)
df.info()
df = pd.get_dummies(df, columns=['season', 'attribute2', 'attribute3', 'attribute4',
                                 'attribute5', 'attribute7', 'attribute9'], drop_first=True)
df.head()
X = df.drop('failure', axis=1, inplace=False)
y = df.failure
rus = RandomUnderSampler(sampling_strategy=1.0, random_state=123)  # 'ratio' was renamed 'sampling_strategy' in newer imblearn
X, y = rus.fit_resample(X, y)  # fit_sample was renamed fit_resample
X, y = np.asarray(X), np.asarray(y)  # ensure positional indexing works in the KFold loops below
print(X.shape, y.shape)
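# SMOTE was imported above; as an alternative sketch (not used downstream), the minority
# class could be oversampled instead of undersampling the majority
X_sm, y_sm = SMOTE(random_state=123).fit_resample(df.drop('failure', axis=1), df.failure)
print(X_sm.shape, y_sm.shape)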
from sklearn.model_selection import train_test_split as tts
X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8,random_state=123, stratify=y) # Stratify Data to Keep Test Balance
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)
from sklearn.linear_model import LogisticRegression as LR
lr = LR(random_state=123).fit(X_train, y_train)
pred_lr = lr.predict(X_test)
accScore_lr = accuracy_score(y_test, pred_lr)
precisionScore_lr = precision_score(y_test, pred_lr)
recallScore_lr = recall_score(y_test, pred_lr)
print('Under-Sampling Logistic Regression: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'.format(a=accScore_lr, b=precisionScore_lr, c=recallScore_lr))
from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)  # shuffle: the undersampler stacks the classes, so unshuffled folds can be single-class
accuracy = []
precision = []
recall = []
model = LR()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Logistic Regression - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
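# The same fold-averaged metrics, computed more compactly with cross_validate (a sketch reusing the same shuffled folds)
from sklearn.model_selection import cross_validate
cv_results = cross_validate(LR(), X, y, cv=kf, scoring=['accuracy', 'precision', 'recall'])
print(cv_results['test_accuracy'].mean(), cv_results['test_precision'].mean(), cv_results['test_recall'].mean())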
from sklearn.metrics import confusion_matrix
import itertools
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    # This function prints and plots the confusion matrix.
    # Normalization can be applied by setting normalize=True.
    # Note: normalization must happen before imshow, or the heatmap shows raw counts while the annotations show ratios.
    if normalize:
        cm = cm.astype(float) / cm.sum(axis=1)[:, np.newaxis]
        print('Normalized Confusion Matrix')
    else:
        print('Confusion Matrix without Normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)
    thresh = cm.max() / 2.0
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], '.2f') if normalize else cm[i, j],
                 horizontalalignment='center', color='white' if cm[i, j] > thresh else 'black')
    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
cnf_matrix = confusion_matrix(y_test, pred_lr)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Logistic Regression - Normalized Confusion Matrix')
plt.show()
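# Newer scikit-learn (>= 1.0) ships an equivalent built-in helper; a one-line sketch,
# left commented so the notebook still runs on older versions:
# from sklearn.metrics import ConfusionMatrixDisplay
# ConfusionMatrixDisplay.from_predictions(y_test, pred_lr, display_labels=class_names, normalize='true')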
from sklearn.metrics import precision_recall_curve, average_precision_score
probs_lr = lr.predict_proba(X_test)[:, 1]  # use probabilities, not hard labels, for a meaningful curve
precision, recall, _ = precision_recall_curve(y_test, probs_lr)
ap_lr = average_precision_score(y_test, probs_lr)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Logistic Regression - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_lr))
from sklearn import metrics
predsLR = lr.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsLR)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Logistic Regression - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
from sklearn.linear_model import RidgeClassifier
ridge = RidgeClassifier(random_state=123).fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)
accScore_ridge = accuracy_score(y_test, pred_ridge)
precisionScore_ridge = precision_score(y_test, pred_ridge)
recallScore_ridge = recall_score(y_test, pred_ridge)
print('Under-Sampling Ridge Classifier: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_ridge, b=precisionScore_ridge, c=recallScore_ridge))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = RidgeClassifier()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Ridge Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_ridge)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Ridge Classifier - Normalized Confusion Matrix')
plt.show()
from sklearn.metrics import precision_recall_curve
scores_ridge = ridge.decision_function(X_test)  # RidgeClassifier exposes decision scores rather than probabilities
precision, recall, _ = precision_recall_curve(y_test, scores_ridge)
ap_ridge = average_precision_score(y_test, scores_ridge)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Ridge Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_ridge))
# RidgeClassifier has no predict_proba, so the block below would raise an AttributeError; kept for reference
'''from sklearn import metrics
predsRidge = ridge.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsRidge)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
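# A working ROC alternative for the ridge model (a sketch): RidgeClassifier exposes decision_function,
# and roc_curve accepts raw decision scores in place of probabilities
fpr, tpr, _ = metrics.roc_curve(y_test, ridge.decision_function(X_test))
roc_auc = metrics.auc(fpr, tpr)
plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label='AUC = %0.2f' % roc_auc)
plt.legend(loc='lower right')
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()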
from sklearn.linear_model import SGDClassifier
sgd = SGDClassifier(random_state=123).fit(X_train, y_train)
pred_sgd = sgd.predict(X_test)
accScore_sgd = accuracy_score(y_test, pred_sgd)
precisionScore_sgd = precision_score(y_test, pred_sgd)
recallScore_sgd = recall_score(y_test, pred_sgd)
print('Under-Sampling Stochastic Gradient Descent: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_sgd, b=precisionScore_sgd, c=recallScore_sgd))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = SGDClassifier()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Stochastic Gradient Descent - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_sgd)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Stochastic Gradient Descent - Normalized Confusion Matrix')
plt.show()
scores_sgd = sgd.decision_function(X_test)  # the default hinge-loss SGDClassifier has no predict_proba; use decision scores
precision, recall, _ = precision_recall_curve(y_test, scores_sgd)
ap_sgd = average_precision_score(y_test, scores_sgd)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Stochastic Gradient Descent - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_sgd))
# The default SGDClassifier (hinge loss) has no predict_proba, so the block below is disabled; kept for reference
'''predsSGD = sgd.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsSGD)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
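# Likewise for SGD (a sketch): score the ROC from decision_function, or refit with a probabilistic
# loss ('log_loss' in recent scikit-learn, 'log' in older releases) to unlock predict_proba
fpr, tpr, _ = metrics.roc_curve(y_test, sgd.decision_function(X_test))
print('SGD AUC: %0.2f' % metrics.auc(fpr, tpr))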
from sklearn.svm import SVC
svc = SVC(random_state=123, probability=True).fit(X_train, y_train)
pred_svc = svc.predict(X_test)
accScore_svc = accuracy_score(y_test, pred_svc)
precisionScore_svc = precision_score(y_test, pred_svc)
recallScore_svc = recall_score(y_test, pred_svc)
print('Under-Sampling Support Vector Machine: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_svc, b=precisionScore_svc, c=recallScore_svc))
#from sklearn.model_selection import KFold
kf = KFold(n_splits=5, shuffle=True, random_state=123)
accuracy = []
precision = []
recall = []
model = SVC()
for train_index, test_index in kf.split(X):
    print('TRAIN:', train_index, 'TEST:', test_index, '\n')
    Xtrain, Xtest = X[train_index], X[test_index]
    ytrain, ytest = y[train_index], y[test_index]
    pred = model.fit(Xtrain, ytrain).predict(Xtest)
    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))
print('Under-Sampling Support Vector Machine Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X), b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean()))
cnf_matrix = confusion_matrix(y_test, pred_svc)
np.set_printoptions(precision=2)
plt.figure()
class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Support Vector Machine Classifier - Normalized Confusion Matrix')
plt.show()
probs_svc = svc.predict_proba(X_test)[:, 1]  # probability=True was set when fitting the SVC
precision, recall, _ = precision_recall_curve(y_test, probs_svc)
ap_svc = average_precision_score(y_test, probs_svc)
plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Support Vector Machine Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_svc))
predsSVC = svc.predict_proba(X_test)[:,1]
fpr, tpr, _ = metrics.roc_curve(y_test, predsSVC)
roc_auc = metrics.auc(fpr, tpr)
plt.title('Support Vector Machine Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
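# To wrap up, collect the hold-out scores computed above into one table for easier comparison (a sketch)
summary = pd.DataFrame({
    'accuracy': [accScore_lr, accScore_ridge, accScore_sgd, accScore_svc],
    'precision': [precisionScore_lr, precisionScore_ridge, precisionScore_sgd, precisionScore_svc],
    'recall': [recallScore_lr, recallScore_ridge, recallScore_sgd, recallScore_svc],
}, index=['LogisticRegression', 'RidgeClassifier', 'SGDClassifier', 'SVC'])
print(summary)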