Device Failures - Over-Sampling Regressor Algorithms

Project 5 - Device Failures (Imbalanced Data) - Regression - Over Sampling

Project - Device Failures (Over-Sampling Regression)

Purpose/Goal:

To use the given data to create a classification algorithm that will accurately predict if a device will fail or not.

In this notebook, I manipulate the data for traditional regression algorithms. I also use over-sampling due to the imbalance of device failures.

In [1]:
import numpy as np
from sklearn.cross_validation import train_test_split as tts
from sklearn import linear_model
import pandas as pd
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import recall_score, precision_score, accuracy_score
from sklearn.feature_selection import SelectFromModel
import seaborn as sns
from datetime import datetime
%matplotlib inline
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [2]:
# Load the raw device-failure log: one row per device per day.
df = pd.read_csv('failures.csv')
In [3]:
# Preview the first few rows of the raw data.
df.head()
Out[3]:
date device failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute8 attribute9
0 2015-01-01 S1F01085 0 215630672 56 0 52 6 407438 0 0 7
1 2015-01-01 S1F0166B 0 61370680 0 3 0 6 403174 0 0 0
2 2015-01-01 S1F01E6Y 0 173295968 0 0 0 12 237394 0 0 0
3 2015-01-01 S1F01JE0 0 79694024 0 0 0 6 410186 0 0 0
4 2015-01-01 S1F01R2B 0 135970480 0 0 0 15 313173 0 0 3
In [4]:
# Column dtypes and null counts: 124,494 rows, all columns fully populated.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 12 columns):
date          124494 non-null object
device        124494 non-null object
failure       124494 non-null int64
attribute1    124494 non-null int64
attribute2    124494 non-null int64
attribute3    124494 non-null int64
attribute4    124494 non-null int64
attribute5    124494 non-null int64
attribute6    124494 non-null int64
attribute7    124494 non-null int64
attribute8    124494 non-null int64
attribute9    124494 non-null int64
dtypes: int64(10), object(2)
memory usage: 11.4+ MB
In [5]:
# Pairwise Pearson correlations. Note attribute7 and attribute8 correlate
# exactly 1.0 with each other -- they are duplicates (dropped later).
df.corr()
Out[5]:
failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute8 attribute9
failure 1.000000 0.001984 0.052902 -0.000948 0.067398 0.002270 -0.000550 0.119055 0.119055 0.001622
attribute1 0.001984 1.000000 -0.004248 0.003702 0.001837 -0.003370 -0.001516 0.000151 0.000151 0.001122
attribute2 0.052902 -0.004248 1.000000 -0.002617 0.146593 -0.013999 -0.026350 0.141367 0.141367 -0.002736
attribute3 -0.000948 0.003702 -0.002617 1.000000 0.097452 -0.006696 0.009027 -0.001884 -0.001884 0.532366
attribute4 0.067398 0.001837 0.146593 0.097452 1.000000 -0.009773 0.024870 0.045631 0.045631 0.036069
attribute5 0.002270 -0.003370 -0.013999 -0.006696 -0.009773 1.000000 -0.017051 -0.009384 -0.009384 0.005949
attribute6 -0.000550 -0.001516 -0.026350 0.009027 0.024870 -0.017051 1.000000 -0.012207 -0.012207 0.021152
attribute7 0.119055 0.000151 0.141367 -0.001884 0.045631 -0.009384 -0.012207 1.000000 1.000000 0.006861
attribute8 0.119055 0.000151 0.141367 -0.001884 0.045631 -0.009384 -0.012207 1.000000 1.000000 0.006861
attribute9 0.001622 0.001122 -0.002736 0.532366 0.036069 0.005949 0.021152 0.006861 0.006861 1.000000
In [6]:
# Visualize the pairwise correlations as a heatmap for quick scanning.
correlations = df.corr()
labels = correlations.columns.values

fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(correlations, xticklabels=labels, yticklabels=labels, ax=ax)
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1fbf8d68>
In [7]:
# Summary statistics; the failure mean (~0.00085) already hints at the
# severe class imbalance.
df.describe()
Out[7]:
failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute8 attribute9
count 124494.000000 1.244940e+05 124494.000000 124494.000000 124494.000000 124494.000000 124494.000000 124494.000000 124494.000000 124494.000000
mean 0.000851 1.223868e+08 159.484762 9.940455 1.741120 14.222693 260172.858025 0.292528 0.292528 12.451524
std 0.029167 7.045960e+07 2179.657730 185.747321 22.908507 15.943021 99151.009852 7.436924 7.436924 191.425623
min 0.000000 0.000000e+00 0.000000 0.000000 0.000000 1.000000 8.000000 0.000000 0.000000 0.000000
25% 0.000000 6.127675e+07 0.000000 0.000000 0.000000 8.000000 221452.000000 0.000000 0.000000 0.000000
50% 0.000000 1.227957e+08 0.000000 0.000000 0.000000 10.000000 249799.500000 0.000000 0.000000 0.000000
75% 0.000000 1.833084e+08 0.000000 0.000000 0.000000 12.000000 310266.000000 0.000000 0.000000 0.000000
max 1.000000 2.441405e+08 64968.000000 24929.000000 1666.000000 98.000000 689161.000000 832.000000 832.000000 18701.000000
In [8]:
# Count distinct values per column to gauge cardinality.
df.nunique()
# Low-cardinality attribute columns (28-558 distinct values) could be treated as categorical data
Out[8]:
date             304
device          1168
failure            2
attribute1    123878
attribute2       558
attribute3        47
attribute4       115
attribute5        60
attribute6     44838
attribute7        28
attribute8        28
attribute9        65
dtype: int64
In [9]:
# Compare each attribute's distribution between failing and non-failing
# devices, one boxplot per attribute in a 3x3 grid.
# Refactor: the original cell copy-pasted nine subplot/boxplot pairs;
# a loop produces the identical figure.
fig = plt.figure(figsize=(7,10))

for i in range(1, 10):
    ax = plt.subplot(3, 3, i)
    df.boxplot(column='attribute%d' % i, by='failure', ax=ax)

plt.suptitle('')
plt.tight_layout()
In [10]:
# attribute1 is nearly unique per row (123,878 distinct of 124,494 rows),
# so it behaves like a continuous variable.
df.attribute1.value_counts()

# Continuous variables create problems for Tree Based Algorithms. Creates pure tables too quickly
Out[10]:
165048912    26
57192360     26
89196552     26
169490248    23
165040624    15
57180136     15
89162648     15
12194976     15
169467344    15
165045144    13
89179832     13
0            11
57184544      7
169474944     7
57187976      6
169477432     6
202325472     5
120526024     4
170202672     4
188259592     4
51974040      4
185269928     4
100633152     4
159379088     4
5702496       4
49629347      4
162347792     4
148434224     4
120570664     4
179373176     4
             ..
240315144     1
135395304     1
148289536     1
140383496     1
238793608     1
5140408       1
236795888     1
142033792     1
189816000     1
127402656     1
175656336     1
6328824       1
147421176     1
59359680      1
131723320     1
112735064     1
130283560     1
87835960      1
12777480      1
67957256      1
116992624     1
153237520     1
17206176      1
64124664      1
4356120       1
137755216     1
207337504     1
36671416      1
206450608     1
81160776      1
Name: attribute1, Length: 123878, dtype: int64
In [11]:
# Removing Duplicates

print df.duplicated().sum() # No duplicated rows

df.drop('attribute8',axis=1,inplace=True) #attribute 8 is a duplicate of attribute7
df.head()
0
Out[11]:
date device failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute9
0 2015-01-01 S1F01085 0 215630672 56 0 52 6 407438 0 7
1 2015-01-01 S1F0166B 0 61370680 0 3 0 6 403174 0 0
2 2015-01-01 S1F01E6Y 0 173295968 0 0 0 12 237394 0 0
3 2015-01-01 S1F01JE0 0 79694024 0 0 0 6 410186 0 0
4 2015-01-01 S1F01R2B 0 135970480 0 0 0 15 313173 0 3
In [12]:
# Have an Imbalanced Data Set

numfailure0 = len(df[df.failure==0]) #len(df.failure==0)
numfailure1 = len(df[df.failure==1])
total = numfailure0 + numfailure1

print numfailure0, numfailure1, total
df.failure.value_counts(normalize=True)
124388 106 124494
Out[12]:
0    0.999149
1    0.000851
Name: failure, dtype: float64
In [13]:
print df.date.nunique(), df.device.nunique()
print df.date.nunique()*df.device.nunique()/float(total)

df.date.value_counts()

# Does not appear to be an obvious pattern between 
304 1168
2.85212138738
Out[13]:
2015-01-01    1163
2015-01-02    1163
2015-01-03    1163
2015-01-04    1162
2015-01-05    1161
2015-01-06    1054
2015-01-07     798
2015-01-09     756
2015-01-08     756
2015-01-10     755
2015-01-11     755
2015-01-12     755
2015-01-13     755
2015-01-14     716
2015-01-15     715
2015-01-17     715
2015-01-16     715
2015-01-29     715
2015-01-18     714
2015-01-20     713
2015-01-19     713
2015-02-08     712
2015-02-09     712
2015-02-07     712
2015-02-04     712
2015-02-05     712
2015-02-02     712
2015-02-03     712
2015-02-01     712
2015-02-14     712
              ...
2015-09-05     146
2015-09-06     146
2015-09-07     146
2015-10-06     141
2015-10-09     141
2015-10-08     141
2015-10-07     141
2015-10-12     140
2015-10-10     140
2015-10-11     140
2015-09-03     115
2015-10-13     111
2015-10-14     111
2015-10-17     109
2015-10-15     109
2015-10-19     109
2015-10-16     109
2015-10-18     109
2015-10-21      69
2015-10-22      69
2015-10-20      69
2015-10-26      32
2015-10-25      32
2015-10-24      32
2015-10-23      32
2015-10-29      31
2015-10-31      31
2015-10-30      31
2015-10-27      31
2015-11-02      31
Name: date, Length: 304, dtype: int64
In [14]:
# Inspect the first 20 failure rows to look for shared characteristics.
df[df.failure==1].head(20)
Out[14]:
date device failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute9
4885 2015-01-05 S1F0RRB1 1 48467332 64776 0 841 8 39267 56 1
6879 2015-01-07 S1F0CTDN 1 184069720 528 0 4 9 387871 32 3
8823 2015-01-09 W1F0PNA5 1 136429411 64784 0 406 30 224801 8 0
11957 2015-01-13 W1F13SRV 1 188251248 2040 0 0 6 39345 32 1
12668 2015-01-14 W1F1230J 1 220461296 0 0 0 14 325125 0 0
14734 2015-01-17 W1F0T034 1 154998752 1312 0 18 10 306996 0 0
15087 2015-01-18 S1F0GG8X 1 54292264 64736 0 160 11 192179 0 2
15773 2015-01-19 S1F023H2 1 64499464 0 0 1 19 514661 16 3
15859 2015-01-19 S1F0QY11 1 159635352 0 0 9 7 231336 16 0
15898 2015-01-19 S1F0S2WJ 1 238299872 1184 0 0 14 268662 8 0
16223 2015-01-19 W1F0Z1W9 1 77877592 0 0 0 12 233238 0 0
16257 2015-01-19 W1F15S4D 1 241813024 3528 0 74 6 106159 16 0
16403 2015-01-19 Z1F0LVPW 1 120878440 304 1 34 7 298439 40 52
16420 2015-01-19 Z1F0NVZA 1 21389544 0 0 60 10 209721 16 0
16475 2015-01-19 Z1F1FCH5 1 163342800 0 0 6 7 239297 24 0
16554 2015-01-20 S1F0P3G2 1 178102642 10288 0 48 8 261020 424 1
18231 2015-01-22 W1F0F6BN 1 22646880 0 0 58 12 244253 0 0
18991 2015-01-23 W1F0P114 1 192865952 0 0 9 36 271129 0 0
19059 2015-01-23 W1F0X4FC 1 64408168 0 0 0 7 245849 48 0
20817 2015-01-26 S1F0LCTV 1 47001646 0 0 7 12 468887 0 0
In [15]:
# It appears measurement are taken until the device fails, since the last measurement is the failure

print df[df.failure==1].device.duplicated().sum()
print df[df.device=='S1F0P3G2']
print df[df.device=='W1F0P114']
0
             date    device  failure  attribute1  attribute2  attribute3  \
143    2015-01-01  S1F0P3G2        0    48342824           0           0
1306   2015-01-02  S1F0P3G2        0    67760976           0           0
2469   2015-01-03  S1F0P3G2        0    90052720           0           0
3632   2015-01-04  S1F0P3G2        0   111839288           0           0
4793   2015-01-05  S1F0P3G2        0   136067776           0           0
5918   2015-01-06  S1F0P3G2        0   156791032           0           0
6936   2015-01-07  S1F0P3G2        0   181020736           0           0
7733   2015-01-08  S1F0P3G2        0   201764136           0           0
8489   2015-01-09  S1F0P3G2        0   228380344           0           0
9245   2015-01-10  S1F0P3G2        0     6166904           0           0
10000  2015-01-11  S1F0P3G2        0    28846072           0           0
10755  2015-01-12  S1F0P3G2        0    49137600           0           0
11510  2015-01-13  S1F0P3G2        0    75091544           0           0
12265  2015-01-14  S1F0P3G2        0    94906992           0           0
12981  2015-01-15  S1F0P3G2        0   114669384           0           0
13696  2015-01-16  S1F0P3G2        0   140000456           0           0
14411  2015-01-17  S1F0P3G2        0   119162104           0           0
15126  2015-01-18  S1F0P3G2        0   200708872         328           0
15839  2015-01-19  S1F0P3G2        0    45538552        2848           0
16554  2015-01-20  S1F0P3G2        1   178102642       10288           0

       attribute4  attribute5  attribute6  attribute7  attribute9
143             0           8      241097           0           1
1306            0           8      242452           0           1
2469            0           8      243829           0           1
3632            0           8      245067           0           1
4793            0           8      246390           0           1
5918            0           8      247802           0           1
6936            0           8      249129           0           1
7733            0           8      250461           0           1
8489            0           8      251675           0           1
9245            0           8      253036           0           1
10000           0           8      254357           0           1
10755           0           8      255693           0           1
11510           0           8      256979           0           1
12265           0           8      258306           0           1
12981           0           8      259648           0           1
13696           0           8      260926           0           1
14411          12           8      261020           0           1
15126          19           8      261020           0           1
15839          29           8      261020         136           1
16554          48           8      261020         424           1
             date    device  failure  attribute1  attribute2  attribute3  \
636    2015-01-01  W1F0P114        0    59684104           0           0
1799   2015-01-02  W1F0P114        0    84034728           0           0
2962   2015-01-03  W1F0P114        0   105642752           0           0
4125   2015-01-04  W1F0P114        0   128775712           0           0
5286   2015-01-05  W1F0P114        0   150367752           0           0
6392   2015-01-06  W1F0P114        0   173205864           0           0
7293   2015-01-07  W1F0P114        0   196586856           0           0
8057   2015-01-08  W1F0P114        0   217842552           0           0
8813   2015-01-09  W1F0P114        0   240175208           0           0
9569   2015-01-10  W1F0P114        0    82812176           0           0
10324  2015-01-11  W1F0P114        0     4354704           0           0
11079  2015-01-12  W1F0P114        0    27727272           0           0
11834  2015-01-13  W1F0P114        0   211767464           0           0
12565  2015-01-14  W1F0P114        0   183133992           0           0
13281  2015-01-15  W1F0P114        0   140731048           0           0
13996  2015-01-16  W1F0P114        0    68931064           0           0
14711  2015-01-17  W1F0P114        0   150261624           0           0
15426  2015-01-18  W1F0P114        0   167907000           0           0
16139  2015-01-19  W1F0P114        0   191519505           0           0
16855  2015-01-20  W1F0P114        0   215121217           0           0
17567  2015-01-21  W1F0P114        0     3207048           0           0
18279  2015-01-22  W1F0P114        0   109931016           0           0
18991  2015-01-23  W1F0P114        1   192865952           0           0

       attribute4  attribute5  attribute6  attribute7  attribute9
636             6          35      248464           0           0
1799            6          35      249831           0           0
2962            6          35      251159           0           0
4125            6          35      252484           0           0
5286            6          35      253790           0           0
6392            6          35      255106           0           0
7293            6          35      256482           0           0
8057            6          35      257807           0           0
8813            6          35      259118           0           0
9569            6          36      259747           0           0
10324           7          36      260106           0           0
11079           7          36      261447           0           0
11834           7          36      262654           0           0
12565           7          36      263384           0           0
13281           7          36      263384           0           0
13996           7          36      263384           0           0
14711           8          36      264721           0           0
15426           8          36      266076           0           0
16139           9          36      267404           0           0
16855           9          36      268734           0           0
17567           9          36      270117           0           0
18279           9          36      270678           0           0
18991           9          36      271129           0           0
In [16]:
# We must assume all devices are created equal, so failure should be a
# function of the attributes plus elapsed time. Create a 'daysActive'
# column measuring days since the earliest observation.

df.date = pd.to_datetime(df.date)
# Robustness fix: anchor on the minimum date rather than df.date[0].
# Identical here (the file is sorted by date, first row is 2015-01-01),
# but correct even if the rows ever arrive unsorted.
df['daysActive'] = (df.date - df.date.min()).dt.days
df.head()
Out[16]:
date device failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute9 daysActive
0 2015-01-01 S1F01085 0 215630672 56 0 52 6 407438 0 7 0
1 2015-01-01 S1F0166B 0 61370680 0 3 0 6 403174 0 0 0
2 2015-01-01 S1F01E6Y 0 173295968 0 0 0 12 237394 0 0 0
3 2015-01-01 S1F01JE0 0 79694024 0 0 0 6 410186 0 0 0
4 2015-01-01 S1F01R2B 0 135970480 0 0 0 15 313173 0 3 0
In [17]:
# Derive a 'season' feature from the observation date, using Northern
# Hemisphere seasonality (all dates fall in 2015).

def season_of(date):
    # Map a 2015 date to its season label.
    if date < datetime(2015, 3, 1):
        return 'winter'
    elif date < datetime(2015, 6, 1):
        return 'spring'
    elif date < datetime(2015, 9, 1):
        return 'summer'
    return 'fall'

df['season'] = df.date.apply(season_of)

df.head()
Out[17]:
date device failure attribute1 attribute2 attribute3 attribute4 attribute5 attribute6 attribute7 attribute9 daysActive season
0 2015-01-01 S1F01085 0 215630672 56 0 52 6 407438 0 7 0 winter
1 2015-01-01 S1F0166B 0 61370680 0 3 0 6 403174 0 0 0 winter
2 2015-01-01 S1F01E6Y 0 173295968 0 0 0 12 237394 0 0 0 winter
3 2015-01-01 S1F01JE0 0 79694024 0 0 0 6 410186 0 0 0 winter
4 2015-01-01 S1F01R2B 0 135970480 0 0 0 15 313173 0 3 0 winter
In [18]:
# Drop columns that should not be model features: 'date' is superseded by
# daysActive/season, and 'device' is a per-unit identifier.
df.drop(['date','device'],axis=1,inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 11 columns):
failure       124494 non-null int64
attribute1    124494 non-null int64
attribute2    124494 non-null int64
attribute3    124494 non-null int64
attribute4    124494 non-null int64
attribute5    124494 non-null int64
attribute6    124494 non-null int64
attribute7    124494 non-null int64
attribute9    124494 non-null int64
daysActive    124494 non-null int64
season        124494 non-null object
dtypes: int64(10), object(1)
memory usage: 10.4+ MB

Cleaning Data for Regression Algorithms

In [19]:
# Min-max scale the two continuous attributes (1 and 6) to [0, 1].
# Refactor: one vectorized helper replaces the copy-pasted min/max pairs
# and the per-element .apply() lambdas (same results, much faster).
# NOTE(review): scaling is fit on the full dataset before the train/test
# split, which leaks test-set statistics -- consider fitting on train only.

def _min_max_scale(series):
    # Rescale a numeric Series to [0, 1] using its own min and max.
    low = float(series.min())
    high = float(series.max())
    return (series - low) / (high - low)

df.attribute1 = _min_max_scale(df.attribute1)
df.attribute6 = _min_max_scale(df.attribute6)
In [20]:
print 'Skew for {a} is: {b}'.format(a='attribute1', b=df.attribute1.skew())
print 'Skew for {a} is: {b}'.format(a='attribute6', b=df.attribute6.skew())

fig = plt.figure(figsize=(10,10))

ax1 = plt.subplot(121)
ax2 = plt.subplot(122)

df.hist(column='attribute1',ax=ax1)
df.hist(column='attribute6',ax=ax2)
Skew for attribute1 is: -0.0111148073703
Skew for attribute6 is: -0.37529070533
Out[20]:
array([<matplotlib.axes._subplots.AxesSubplot object at 0x0000000020322D68>], dtype=object)
In [21]:
# Confirm attribute1/attribute6 are now float64 after scaling.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 124494 entries, 0 to 124493
Data columns (total 11 columns):
failure       124494 non-null int64
attribute1    124494 non-null float64
attribute2    124494 non-null int64
attribute3    124494 non-null int64
attribute4    124494 non-null int64
attribute5    124494 non-null int64
attribute6    124494 non-null float64
attribute7    124494 non-null int64
attribute9    124494 non-null int64
daysActive    124494 non-null int64
season        124494 non-null object
dtypes: float64(2), int64(8), object(1)
memory usage: 10.4+ MB
In [22]:
# One-hot encode season plus the low-cardinality integer attributes,
# dropping the first level of each to avoid collinearity.
# NOTE: this explodes the frame to 874 columns (see next cell).
df = pd.get_dummies(df,columns=['season','attribute2','attribute3','attribute4','attribute5','attribute7','attribute9'],drop_first=True)
In [23]:
# Preview the encoded frame (874 columns after dummy expansion).
df.head()
Out[23]:
failure attribute1 attribute6 daysActive season_spring season_summer season_winter attribute2_8 attribute2_16 attribute2_24 ... attribute9_1165 attribute9_1864 attribute9_2269 attribute9_2270 attribute9_2522 attribute9_2637 attribute9_2794 attribute9_7226 attribute9_10137 attribute9_18701
0 0 0.883224 0.591204 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
1 0 0.251374 0.585017 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
2 0 0.709821 0.344461 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
3 0 0.326427 0.595191 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0
4 0 0.556935 0.454420 0 0 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 874 columns

Modeling - Regression Algorithms

In [24]:
# Separate features (X) from the binary target (y = failure).
X = df.drop('failure', axis=1, inplace=False)
y = df.failure

print X.shape, y.shape
(124494, 873) (124494L,)
In [25]:
from sklearn.cross_validation import train_test_split as tts

X_train, X_test, y_train, y_test = tts(X, y, train_size=0.8,random_state=123, stratify=y) # Stratify Data to Keep Test Balance

print X_train.shape, X_test.shape, y_train.shape, y_test.shape
(99595, 873) (24899, 873) (99595L,) (24899L,)

Over-Sampling

SMOTE

In [26]:
# Over-sample the minority (failure) class with SMOTE so the training set
# becomes balanced 1:1 (ratio=1.0). The test set is left untouched.
# NOTE(review): a float `ratio` is deprecated per the warning below --
# prefer a str/dict (e.g. ratio='minority') on upgrade; confirm equivalence.
sm = SMOTE(random_state=123, ratio=1.0)

X_train_smote, y_train_smote = sm.fit_sample(X_train, y_train)

print X_train_smote.shape, y_train_smote.shape
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\deprecation.py:75: DeprecationWarning: Function _ratio_float is deprecated; Use a float for 'ratio' is deprecated from version 0.2. The support will be removed in 0.4. Use a dict, str, or a callable instead.
  warnings.warn(msg, category=DeprecationWarning)
(199020L, 873L) (199020L,)

SMOTEENN

In [27]:
# SMOTE over-sampling followed by Edited Nearest Neighbours cleaning, which
# removes some borderline samples (hence slightly fewer rows than SMOTE alone).
# NOTE(review): float `ratio` is deprecated -- see the SMOTE cell's note.
sm = SMOTEENN(random_state=123, ratio=1.0)

X_train_smoteenn, y_train_smoteenn = sm.fit_sample(X_train, y_train)

print X_train_smoteenn.shape, y_train_smoteenn.shape
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\utils\deprecation.py:75: DeprecationWarning: Function _ratio_float is deprecated; Use a float for 'ratio' is deprecated from version 0.2. The support will be removed in 0.4. Use a dict, str, or a callable instead.
  warnings.warn(msg, category=DeprecationWarning)
(197432L, 873L) (197432L,)

Logistic Regression (SMOTE)

In [28]:
from sklearn.linear_model import LogisticRegression as LR
In [29]:
lr_smote = LR(random_state=123).fit(X_train_smote, y_train_smote)
pred_lr_smote = lr_smote.predict(X_test)

accScore_lr_smote = accuracy_score(y_test, pred_lr_smote)
precisionScore_lr_smote = precision_score(y_test, pred_lr_smote)
recallScore_lr_smote = recall_score(y_test, pred_lr_smote)

print 'Over-Sampling (SMOTE) Logistic Regression: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'.format(a=accScore_lr_smote, b=precisionScore_lr_smote, c=recallScore_lr_smote)
Over-Sampling (SMOTE) Logistic Regression:
Accuracy Score: 0.971243825053.
Precision Score: 0.00286123032904.
Recall Score: 0.0952380952381.

In [44]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = LR()

for train_index, test_index in kf.split(X_train_smote):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smote[train_index], X_train_smote[test_index]
    ytrain, ytest = y_train_smote[train_index], y_train_smote[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTE) Logistic Regression - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 39804  39805  39806 ..., 199017 199018 199019] TEST: [    0     1     2 ..., 39801 39802 39803]

TRAIN: [     0      1      2 ..., 199017 199018 199019] TEST: [39804 39805 39806 ..., 79605 79606 79607]

TRAIN: [     0      1      2 ..., 199017 199018 199019] TEST: [ 79608  79609  79610 ..., 119409 119410 119411]

TRAIN: [     0      1      2 ..., 199017 199018 199019] TEST: [119412 119413 119414 ..., 159213 159214 159215]

TRAIN: [     0      1      2 ..., 159213 159214 159215] TEST: [159216 159217 159218 ..., 199017 199018 199019]

Over-Sampling (SMOTE) Logistic Regression - 5 Folds Testing:
Average Accuracy Score: 0.938684554316.
Average Precision Score: 0.600051390954.
Average Recall Score: 0.908635030176.

In [45]:
from sklearn.metrics import confusion_matrix
import itertools

def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):

    # This function prints the confusion matrix
    # Normalization can be applied by setting it to true

    plt.imshow(cm, interpolation= 'nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=0)
    plt.yticks(tick_marks, classes)

    if normalize==True:
        cm = cm.astype(float)/cm.sum(axis=1)[:, np.newaxis]
        print 'Normalized Confusion Matrix'
    else:
        print 'Confusion Matrix without Normalization'

    print cm

    thresh = cm.max()/2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i,j], horizontalalignment='center', color='white' if cm[i,j] > thresh else 'black')

    plt.tight_layout()
    plt.ylabel('True Label')
    plt.xlabel('Predicted Label')
In [46]:
# Normalized confusion matrix for the SMOTE logistic regression model.
np.set_printoptions(precision=2)

plt.figure()
plot_confusion_matrix(
    confusion_matrix(y_test, pred_lr_smote),
    classes=['Not Failure', 'Failure'],
    normalize=True,
    title='Logistic Regression (SMOTE) - Normalized Confusion Matrix',
)
plt.show()
Normalized Confusion Matrix
[[ 0.97  0.03]
 [ 0.9   0.1 ]]
In [47]:
# Precision-recall curve for the SMOTE logistic regression model.
# Fixes two issues in the original cell:
#   1. The curve is now built from predicted probabilities rather than hard
#      0/1 labels (hard labels yield a degenerate three-point "curve").
#   2. The title's "AP" is the actual average precision of the ranking,
#      not the point precision of the thresholded predictions.
from sklearn.metrics import precision_recall_curve, average_precision_score

probs_lr_smote = lr_smote.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs_lr_smote)
ap_lr_smote = average_precision_score(y_test, probs_lr_smote)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Logistic Regression (SMOTE) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_lr_smote))
Out[47]:
<matplotlib.text.Text at 0x22903908>
In [48]:
from sklearn import metrics

# ROC curve for the SMOTE logistic regression model, scored on the
# predicted probability of the positive (failure) class.
failure_probs = lr_smote.predict_proba(X_test)[:, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, failure_probs)
roc_auc = metrics.auc(fpr, tpr)

plt.title('Logistic Regression (SMOTE) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')   # chance diagonal for reference
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

Logistic Regression (SMOTEENN)

In [49]:
from sklearn.linear_model import LogisticRegression as LR
In [50]:
lr_smoteenn = LR(random_state=123).fit(X_train_smoteenn, y_train_smoteenn)
pred_lr_smoteenn = lr_smoteenn.predict(X_test)

accScore_lr_smoteenn = accuracy_score(y_test, pred_lr_smoteenn)
precisionScore_lr_smoteenn = precision_score(y_test, pred_lr_smoteenn)
recallScore_lr_smoteenn = recall_score(y_test, pred_lr_smoteenn)

print 'Over-Sampling (SMOTEENN) Logistic Regression: \nAccuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.\n'.format(a=accScore_lr_smoteenn, b=precisionScore_lr_smoteenn, c=recallScore_lr_smoteenn)
Over-Sampling (SMOTEENN) Logistic Regression:
Accuracy Score: 0.970802040243.
Precision Score: 0.00281690140845.
Recall Score: 0.0952380952381.

In [57]:
from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = LR()

for train_index, test_index in kf.split(X):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smoteenn[train_index], X_train_smoteenn[test_index]
    ytrain, ytest = y_train_smoteenn[train_index], y_train_smoteenn[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTEENN) Logistic Regression - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 24899  24900  24901 ..., 124491 124492 124493] TEST: [    0     1     2 ..., 24896 24897 24898]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [24899 24900 24901 ..., 49795 49796 49797]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [49798 49799 49800 ..., 74694 74695 74696]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [74697 74698 74699 ..., 99593 99594 99595]

TRAIN: [    0     1     2 ..., 99593 99594 99595] TEST: [ 99596  99597  99598 ..., 124491 124492 124493]

Over-Sampling (SMOTEENN) Logistic Regression - 5 Folds Testing:
Average Accuracy Score: 0.898410020212.
Average Precision Score: 0.364145658263.
Average Recall Score: 0.286437365829.

In [58]:
# Normalized confusion matrix for the SMOTEENN logistic regression model.
cnf_matrix = confusion_matrix(y_test, pred_lr_smoteenn)
np.set_printoptions(precision=2)

plt.figure()

class_names = ['Not Failure', 'Failure']
# Typo fix in the title: 'SMTOEENN' -> 'SMOTEENN'.
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True, title='Logistic Regression (SMOTEENN) - Normalized Confusion Matrix')

plt.show()
Normalized Confusion Matrix
[[ 0.97  0.03]
 [ 0.9   0.1 ]]
In [59]:
# Precision-recall curve for the SMOTEENN logistic regression model.
# Same fixes as the SMOTE PR cell: the curve is built from predicted
# probabilities (hard 0/1 labels give a degenerate curve), and the title's
# "AP" is the actual average precision rather than the point precision of
# the thresholded predictions.
from sklearn.metrics import precision_recall_curve, average_precision_score

probs_lr_smoteenn = lr_smoteenn.predict_proba(X_test)[:, 1]
precision, recall, _ = precision_recall_curve(y_test, probs_lr_smoteenn)
ap_lr_smoteenn = average_precision_score(y_test, probs_lr_smoteenn)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlabel('Recall')
plt.ylabel('Precision')
plt.ylim([0.0, 1.05])
plt.xlim([0.0, 1.0])
plt.title('Logistic Regression (SMOTEENN) - 2-class Precision-Recall curve: AP={0:0.2f}'.format(ap_lr_smoteenn))
Out[59]:
<matplotlib.text.Text at 0x21128080>
In [60]:
from sklearn import metrics

# ROC curve for the SMOTEENN logistic regression model.
predsLR_smoteenn = lr_smoteenn.predict_proba(X_test)[:,1]
fpr, tpr, _ =  metrics.roc_curve(y_test, predsLR_smoteenn)

roc_auc = metrics.auc(fpr, tpr)

# Label fix: this cell plots the SMOTEENN model, but the title said 'SMOTE'.
plt.title('Logistic Regression (SMOTEENN) - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()

Ridge Regression (SMOTE)

In [61]:
from sklearn.linear_model import RidgeClassifier
In [62]:
ridge_smote = RidgeClassifier(random_state=123).fit(X_train_smote, y_train_smote)
pred_ridge_smote = ridge_smote.predict(X_test)

accScore_ridge_smote = accuracy_score(y_test, pred_ridge_smote)
precisionScore_ridge_smote = precision_score(y_test, pred_ridge_smote)
recallScore_ridge_smote = recall_score(y_test, pred_ridge_smote)

print 'Over-Sampling (SMTOE) Ridge Regression: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_ridge_smote, b=precisionScore_ridge_smote, c=recallScore_ridge_smote)
Over-Sampling (SMTOE) Ridge Regression: Accuracy Score: 0.973533073617.
Precision Score: 0.00465838509317.
Recall Score: 0.142857142857.
In [63]:
#from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = RidgeClassifier()

for train_index, test_index in kf.split(X):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smote[train_index], X_train_smote[test_index]
    ytrain, ytest = y_train_smote[train_index], y_train_smote[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTE) Ridge Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 24899  24900  24901 ..., 124491 124492 124493] TEST: [    0     1     2 ..., 24896 24897 24898]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [24899 24900 24901 ..., 49795 49796 49797]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [49798 49799 49800 ..., 74694 74695 74696]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [74697 74698 74699 ..., 99593 99594 99595]

TRAIN: [    0     1     2 ..., 99593 99594 99595] TEST: [ 99596  99597  99598 ..., 124491 124492 124493]

Over-Sampling (SMOTE) Ridge Classifier - 5 Folds Testing:
Average Accuracy Score: 0.798137043662.
Average Precision Score: 0.227918454314.
Average Recall Score: 0.685125795183.

In [64]:
# Normalized confusion matrix for the SMOTE-trained ridge classifier.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, pred_ridge_smote)

plt.figure()

class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Ridge Classifier - Normalized Confusion Matrix')

plt.show()
Normalized Confusion Matrix
[[ 0.97  0.03]
 [ 0.86  0.14]]
In [65]:
from sklearn.metrics import precision_recall_curve

# Precision-recall trade-off for the SMOTE-trained ridge classifier.
# NOTE(review): derived from hard class predictions rather than
# decision scores -- confirm this is intentional.
precision, recall, _ = precision_recall_curve(y_test, pred_ridge_smote)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Ridge Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(precisionScore_ridge_smote))
Out[65]:
<matplotlib.text.Text at 0x4346ff60>
In [66]:
# Disabled on purpose: RidgeClassifier exposes no predict_proba (it only
# has decision_function), so this ROC block would raise AttributeError.
# Kept as a string literal for reference; consider using
# decision_function scores with metrics.roc_curve instead.
'''from sklearn import metrics

predsRidge_smote = ridge_smote.predict_proba(X_test)[:,1]
fpr, tpr, _ =  metrics.roc_curve(y_test, predsRidge_smote)

roc_auc = metrics.auc(fpr, tpr)

plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
Out[66]:
"from sklearn import metrics\n\npredsRidge_smote = ridge_smote.predict_proba(X_test)[:,1]\nfpr, tpr, _ =  metrics.roc_curve(y_test, predsRidge_smote)\n\nroc_auc = metrics.auc(fpr, tpr)\n\nplt.title('Ridge Classifier - Receiver Operating Characteristic')\nplt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\nplt.legend(loc = 'lower right')\nplt.plot([0,1], [0,1], 'r--')\nplt.xlim([0,1])\nplt.ylim([0,1])\nplt.ylabel('True Positive Rate')\nplt.xlabel('False Positive Rate')\nplt.show()"

Ridge Regression (SMOTEENN)

In [67]:
ridge_smoteenn = RidgeClassifier(random_state=123).fit(X_train_smoteenn, y_train_smoteenn)
pred_ridge_smoteenn = ridge_smoteenn.predict(X_test)

accScore_ridge_smoteenn = accuracy_score(y_test, pred_ridge_smoteenn)
precisionScore_ridge_smoteenn = precision_score(y_test, pred_ridge_smoteenn)
recallScore_ridge_smoteenn = recall_score(y_test, pred_ridge_smoteenn)

print 'Over-Sampling (SMOTEENN) Ridge Regression: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_ridge_smoteenn, b=precisionScore_ridge_smoteenn, c=recallScore_ridge_smoteenn)
Over-Sampling (SMOTEENN) Ridge Regression: Accuracy Score: 0.973372424595.
Precision Score: 0.00462962962963.
Recall Score: 0.142857142857.
In [76]:
#from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = RidgeClassifier()

for train_index, test_index in kf.split(X):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smoteenn[train_index], X_train_smoteenn[test_index]
    ytrain, ytest = y_train_smoteenn[train_index], y_train_smoteenn[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTEENN) Ridge Classifier - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 24899  24900  24901 ..., 124491 124492 124493] TEST: [    0     1     2 ..., 24896 24897 24898]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [24899 24900 24901 ..., 49795 49796 49797]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [49798 49799 49800 ..., 74694 74695 74696]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [74697 74698 74699 ..., 99593 99594 99595]

TRAIN: [    0     1     2 ..., 99593 99594 99595] TEST: [ 99596  99597  99598 ..., 124491 124492 124493]

Over-Sampling (SMOTEENN) Ridge Classifier - 5 Folds Testing:
Average Accuracy Score: 0.900040854586.
Average Precision Score: 0.352723311547.
Average Recall Score: 0.285042042749.

In [77]:
# Normalized confusion matrix for the SMOTEENN-trained ridge classifier.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, pred_ridge_smoteenn)

plt.figure()

class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Ridge Classifier - Normalized Confusion Matrix')

plt.show()
Normalized Confusion Matrix
[[ 0.97  0.03]
 [ 0.86  0.14]]
In [78]:
from sklearn.metrics import precision_recall_curve

# Precision-recall trade-off for the SMOTEENN-trained ridge classifier.
# NOTE(review): derived from hard class predictions rather than
# decision scores -- confirm this is intentional.
precision, recall, _ = precision_recall_curve(y_test, pred_ridge_smoteenn)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Ridge Classifier - 2-class Precision-Recall curve: AP={0:0.2f}'.format(precisionScore_ridge_smoteenn))
Out[78]:
<matplotlib.text.Text at 0x2291aeb8>
In [79]:
# Disabled on purpose: RidgeClassifier exposes no predict_proba (only
# decision_function), so this ROC block would raise AttributeError.
# Kept as a string literal for reference.
'''from sklearn import metrics

predsRidge_smoteenn = ridge_smoteenn.predict_proba(X_test)[:,1]
fpr, tpr, _ =  metrics.roc_curve(y_test, predsRidge_smoteenn)

roc_auc = metrics.auc(fpr, tpr)

plt.title('Ridge Classifier - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
Out[79]:
"from sklearn import metrics\n\npredsRidge_smoteenn = ridge_smoteenn.predict_proba(X_test)[:,1]\nfpr, tpr, _ =  metrics.roc_curve(y_test, predsRidge_smoteenn)\n\nroc_auc = metrics.auc(fpr, tpr)\n\nplt.title('Ridge Classifier - Receiver Operating Characteristic')\nplt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\nplt.legend(loc = 'lower right')\nplt.plot([0,1], [0,1], 'r--')\nplt.xlim([0,1])\nplt.ylim([0,1])\nplt.ylabel('True Positive Rate')\nplt.xlabel('False Positive Rate')\nplt.show()"

Stochastic Gradient Descent (SMOTE)

In [80]:
from sklearn.linear_model import SGDClassifier
In [81]:
# Train an SGD classifier on the SMOTE-resampled training data and score
# it on the untouched test split.
sgd_smote = SGDClassifier(random_state=123)
sgd_smote.fit(X_train_smote, y_train_smote)
pred_sgd_smote = sgd_smote.predict(X_test)

recallScore_sgd_smote = recall_score(y_test, pred_sgd_smote)
precisionScore_sgd_smote = precision_score(y_test, pred_sgd_smote)
accScore_sgd_smote = accuracy_score(y_test, pred_sgd_smote)
In [82]:
print 'Over-Sampling (SMOTE) Stochastic Gradient Descent: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_sgd_smote, b=precisionScore_sgd_smote, c=recallScore_sgd_smote)
Over-Sampling (SMOTE) Stochastic Gradient Descent: Accuracy Score: 0.890316880196.
Precision Score: 0.0032991202346.
Recall Score: 0.428571428571.
In [83]:
#from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = SGDClassifier()

for train_index, test_index in kf.split(X):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smote[train_index], X_train_smote[test_index]
    ytrain, ytest = y_train_smote[train_index], y_train_smote[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTE) Stochastic Gradient Descent - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 24899  24900  24901 ..., 124491 124492 124493] TEST: [    0     1     2 ..., 24896 24897 24898]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [24899 24900 24901 ..., 49795 49796 49797]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [49798 49799 49800 ..., 74694 74695 74696]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [74697 74698 74699 ..., 99593 99594 99595]

TRAIN: [    0     1     2 ..., 99593 99594 99595] TEST: [ 99596  99597  99598 ..., 124491 124492 124493]

Over-Sampling (SMOTE) Stochastic Gradient Descent - 5 Folds Testing:
Average Accuracy Score: 0.594835133941.
Average Precision Score: 0.00538560195888.
Average Recall Score: 0.394300328827.

In [84]:
# Normalized confusion matrix for the SMOTE-trained SGD classifier.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, pred_sgd_smote)

plt.figure()

class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Stochastic Gradient Descent - Normalized Confusion Matrix')

plt.show()
Normalized Confusion Matrix
[[ 0.89  0.11]
 [ 0.57  0.43]]
In [85]:
# Precision-recall trade-off for the SMOTE-trained SGD classifier.
# NOTE(review): derived from hard class predictions rather than
# decision scores -- confirm this is intentional.
precision, recall, _ = precision_recall_curve(y_test, pred_sgd_smote)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Stochastic Gradient Descent - 2-class Precision-Recall curve: AP={0:0.2f}'.format(precisionScore_sgd_smote))
Out[85]:
<matplotlib.text.Text at 0x22d49630>
In [88]:
# Disabled on purpose: SGDClassifier with the default hinge loss has no
# predict_proba, so this ROC block would raise AttributeError. Also note
# the disabled code has a typo: it assigns predsSGD_smote but then reads
# predSGD_smote. Kept as a string literal for reference.
'''predsSGD_smote = sgd_smote.predict_proba(X_test)[:,1]
fpr, tpr, _ =  metrics.roc_curve(y_test, predSGD_smote)

roc_auc = metrics.auc(fpr, tpr)

plt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
Out[88]:
"predsSGD_smote = sgd_smote.predict_proba(X_test)[:,1]\nfpr, tpr, _ =  metrics.roc_curve(y_test, predSGD_smote)\n\nroc_auc = metrics.auc(fpr, tpr)\n\nplt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')\nplt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\nplt.legend(loc = 'lower right')\nplt.plot([0,1], [0,1], 'r--')\nplt.xlim([0,1])\nplt.ylim([0,1])\nplt.ylabel('True Positive Rate')\nplt.xlabel('False Positive Rate')\nplt.show()"

Stochastic Gradient Descent (SMOTEENN)

In [89]:
# Train an SGD classifier on the SMOTEENN-resampled training data and
# score it on the untouched test split.
sgd_smoteenn = SGDClassifier(random_state=123)
sgd_smoteenn.fit(X_train_smoteenn, y_train_smoteenn)
pred_sgd_smoteenn = sgd_smoteenn.predict(X_test)

recallScore_sgd_smoteenn = recall_score(y_test, pred_sgd_smoteenn)
precisionScore_sgd_smoteenn = precision_score(y_test, pred_sgd_smoteenn)
accScore_sgd_smoteenn = accuracy_score(y_test, pred_sgd_smoteenn)
In [90]:
print 'Over-Sampling (SMOTEENN) Stochastic Gradient Descent: Accuracy Score: {a}.\nPrecision Score: {b}.\nRecall Score: {c}.'.format(a=accScore_sgd_smoteenn, b=precisionScore_sgd_smoteenn, c=recallScore_sgd_smoteenn)
Over-Sampling (SMOTEENN) Stochastic Gradient Descent: Accuracy Score: 0.975460861882.
Precision Score: 0.0066889632107.
Recall Score: 0.190476190476.
In [92]:
#from sklearn.model_selection import KFold

kf = KFold(n_splits = 5)

accuracy = []
precision = []
recall = []

model = SGDClassifier()

for train_index, test_index in kf.split(X):
    print 'TRAIN:', train_index, 'TEST:', test_index, '\n'
    Xtrain, Xtest = X_train_smoteenn[train_index], X_train_smoteenn[test_index]
    ytrain, ytest = y_train_smoteenn[train_index], y_train_smoteenn[test_index]

    pred = model.fit(Xtrain, ytrain).predict(Xtest)

    accuracy.append(accuracy_score(ytest, pred))
    precision.append(precision_score(ytest, pred))
    recall.append(recall_score(ytest, pred))

print 'Over-Sampling (SMOTEENN) Stochastic Gradient Descent - {a} Folds Testing: \nAverage Accuracy Score: {b}.\nAverage Precision Score: {c}.\nAverage Recall Score: {d}.\n'.format(a=kf.get_n_splits(X),b=np.array(accuracy).mean(), c=np.array(precision).mean(), d=np.array(recall).mean())
TRAIN: [ 24899  24900  24901 ..., 124491 124492 124493] TEST: [    0     1     2 ..., 24896 24897 24898]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [24899 24900 24901 ..., 49795 49796 49797]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [49798 49799 49800 ..., 74694 74695 74696]

TRAIN: [     0      1      2 ..., 124491 124492 124493] TEST: [74697 74698 74699 ..., 99593 99594 99595]

TRAIN: [    0     1     2 ..., 99593 99594 99595] TEST: [ 99596  99597  99598 ..., 124491 124492 124493]

Over-Sampling (SMOTEENN) Stochastic Gradient Descent - 5 Folds Testing:
Average Accuracy Score: 0.749499979919.
Average Precision Score: 0.103783783784.
Average Recall Score: 0.0689407540395.

In [93]:
# Normalized confusion matrix for the SMOTEENN-trained SGD classifier.
np.set_printoptions(precision=2)
cnf_matrix = confusion_matrix(y_test, pred_sgd_smoteenn)

plt.figure()

class_names = ['Not Failure', 'Failure']
plot_confusion_matrix(cnf_matrix, classes=class_names, normalize=True,
                      title='Stochastic Gradient Descent - Normalized Confusion Matrix')

plt.show()
Normalized Confusion Matrix
[[ 0.98  0.02]
 [ 0.81  0.19]]
In [94]:
# Precision-recall trade-off for the SMOTEENN-trained SGD classifier.
# NOTE(review): derived from hard class predictions rather than
# decision scores -- confirm this is intentional.
precision, recall, _ = precision_recall_curve(y_test, pred_sgd_smoteenn)

plt.step(recall, precision, color='b', alpha=0.2, where='post')
plt.fill_between(recall, precision, step='post', alpha=0.2, color='b')

plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Stochastic Gradient Descent - 2-class Precision-Recall curve: AP={0:0.2f}'.format(precisionScore_sgd_smoteenn))
Out[94]:
<matplotlib.text.Text at 0x437419b0>
In [96]:
# Disabled on purpose: SGDClassifier with the default hinge loss has no
# predict_proba, so this ROC block would raise AttributeError. Also note
# the disabled code has a typo: it assigns predsSGD_smoteenn but then
# reads predSGD_smoteenn. Kept as a string literal for reference.
'''predsSGD_smoteenn = sgd_smoteenn.predict_proba(X_test)[:,1]
fpr, tpr, _ =  metrics.roc_curve(y_test, predSGD_smoteenn)

roc_auc = metrics.auc(fpr, tpr)

plt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
plt.legend(loc = 'lower right')
plt.plot([0,1], [0,1], 'r--')
plt.xlim([0,1])
plt.ylim([0,1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()'''
Out[96]:
"predsSGD_smoteenn = sgd_smoteenn.predict_proba(X_test)[:,1]\nfpr, tpr, _ =  metrics.roc_curve(y_test, predSGD_smoteenn)\n\nroc_auc = metrics.auc(fpr, tpr)\n\nplt.title('Stochastic Gradient Descent - Receiver Operating Characteristic')\nplt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)\nplt.legend(loc = 'lower right')\nplt.plot([0,1], [0,1], 'r--')\nplt.xlim([0,1])\nplt.ylim([0,1])\nplt.ylabel('True Positive Rate')\nplt.xlabel('False Positive Rate')\nplt.show()"

rss facebook twitter github youtube mail spotify lastfm instagram linkedin google google-plus pinterest medium vimeo stackoverflow reddit quora quora