Service Disruptions Predictions

Project 0 - Service Disruptions

Project - Service Disruptions

Purpose/Goal:

To use the given data to create a classification algorithm that will accurately predict the fault severity of device so as to determine which devices need immediate maintenance.

In [1]:
import numpy as np
from sklearn.cross_validation import train_test_split as tts
import pandas as pd
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
In [2]:
dfEvent = pd.read_csv('event_type.csv')
dfLog = pd.read_csv('log_feature.csv')
dfResource = pd.read_csv('resource_type.csv')
dfSeverity = pd.read_csv('severity_type.csv')
dfTrain = pd.read_csv('train.csv')
In [3]:
# Merge the DataFrames together into one DataFrame

# merge allows you to merge data frames based on values in a row or columns
print dfTrain.nunique()

df1 = pd.merge(dfLog,dfEvent,on='id')
df2 = pd.merge(df1,dfResource,on='id')
df3 = pd.merge(df2,dfSeverity,on='id')
df = pd.merge(dfTrain,df3,on='id')
df.head()
id                7381
location           929
fault_severity       3
dtype: int64
Out[3]:
id location fault_severity log_feature volume event_type resource_type severity_type
0 14121 location 118 1 feature 312 19 event_type 34 resource_type 2 severity_type 2
1 14121 location 118 1 feature 312 19 event_type 35 resource_type 2 severity_type 2
2 14121 location 118 1 feature 232 19 event_type 34 resource_type 2 severity_type 2
3 14121 location 118 1 feature 232 19 event_type 35 resource_type 2 severity_type 2
4 9320 location 91 0 feature 315 200 event_type 34 resource_type 2 severity_type 2
In [4]:
# Strip Features So they are just numbers

df.location = df.location.str.replace('location ','')
df['log_feature'] = df.log_feature.str.replace('feature ','')
df['event_type'] = df.event_type.str.replace('event_type ','')
df['resource_type'] = df.resource_type.str.replace('resource_type ','')
df['severity_type'] = df.severity_type.str.replace('severity_type ','')
print df['location'].nunique()
df.head(10)
929
Out[4]:
id location fault_severity log_feature volume event_type resource_type severity_type
0 14121 118 1 312 19 34 2 2
1 14121 118 1 312 19 35 2 2
2 14121 118 1 232 19 34 2 2
3 14121 118 1 232 19 35 2 2
4 9320 91 0 315 200 34 2 2
5 9320 91 0 315 200 35 2 2
6 9320 91 0 235 116 34 2 2
7 9320 91 0 235 116 35 2 2
8 14394 152 1 221 1 35 2 2
9 14394 152 1 221 1 34 2 2
In [5]:
dfLocation = df.location
y = df[['location','fault_severity']]
df.drop(['fault_severity','id'],axis=1,inplace=True)
df.head()
Out[5]:
location log_feature volume event_type resource_type severity_type
0 118 312 19 34 2 2
1 118 312 19 35 2 2
2 118 232 19 34 2 2
3 118 232 19 35 2 2
4 91 315 200 34 2 2
In [6]:
df = pd.get_dummies(df,columns=['location','log_feature','event_type','resource_type','severity_type'])
In [7]:
df['location'] = dfLocation
In [8]:
df = df.groupby('location').sum()
y = y.groupby('location').mean().round()
print df.head()
print y.head(20)
          volume  location_1  location_10  location_100  location_1000  \
location
1            664       163.0          0.0           0.0            0.0
10            20         0.0          2.0           0.0            0.0
100          246         0.0          0.0          96.0            0.0
1000          29         0.0          0.0           0.0           29.0
1002           2         0.0          0.0           0.0            0.0

          location_1002  location_1005  location_1006  location_1007  \
location
1                   0.0            0.0            0.0            0.0
10                  0.0            0.0            0.0            0.0
100                 0.0            0.0            0.0            0.0
1000                0.0            0.0            0.0            0.0
1002                2.0            0.0            0.0            0.0

          location_1008       ...         resource_type_5  resource_type_6  \
location                      ...
1                   0.0       ...                     0.0              0.0
10                  0.0       ...                     0.0              0.0
100                 0.0       ...                     0.0              0.0
1000                0.0       ...                     0.0              0.0
1002                0.0       ...                     0.0              0.0

          resource_type_7  resource_type_8  resource_type_9  severity_type_1  \
location
1                     0.0            142.0              0.0            150.0
10                    0.0              2.0              0.0              2.0
100                   0.0              0.0              0.0             13.0
1000                  0.0             15.0              0.0             26.0
1002                  0.0              0.0              0.0              2.0

          severity_type_2  severity_type_3  severity_type_4  severity_type_5
location
1                    13.0              0.0              0.0              0.0
10                    0.0              0.0              0.0              0.0
100                  78.0              0.0              5.0              0.0
1000                  3.0              0.0              0.0              0.0
1002                  0.0              0.0              0.0              0.0

[5 rows x 1325 columns]
          fault_severity
location
1                    1.0
10                   0.0
100                  0.0
1000                 0.0
1002                 0.0
1005                 0.0
1006                 0.0
1007                 0.0
1008                 1.0
1009                 0.0
101                  0.0
1010                 0.0
1011                 1.0
1013                 0.0
1014                 1.0
1015                 1.0
1016                 1.0
1017                 1.0
1018                 0.0
1019                 1.0
In [9]:
print len(df.columns)
print len(df)
1325
929
In [10]:
for c in np.arange(1,len(df.columns)):
    for r in np.arange(0,len(df)):
        if df.iloc[r,c]>0:
            df.iloc[r,c]=1

df.head(10)
Out[10]:
volume location_1 location_10 location_100 location_1000 location_1002 location_1005 location_1006 location_1007 location_1008 ... resource_type_5 resource_type_6 resource_type_7 resource_type_8 resource_type_9 severity_type_1 severity_type_2 severity_type_3 severity_type_4 severity_type_5
location
1 664 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
10 20 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 0.0 0.0 0.0 0.0
100 246 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 1.0 0.0 1.0 0.0
1000 29 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
1002 2 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
1005 2 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 0.0 0.0
1006 11 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
1007 340 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 0.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
1008 2670 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 1.0 ... 0.0 0.0 0.0 1.0 0.0 1.0 1.0 0.0 0.0 0.0
1009 6 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 1.0 1.0 1.0 0.0 0.0 0.0 1.0

10 rows × 1325 columns

In [11]:
X_train, X_test, y_train, y_test = tts(df,y.fault_severity,train_size=0.8)

Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score, f1_score

modelRF = RFC(n_estimators=20)
modelRF = modelRF.fit(X_train,y_train)
predictionRF = modelRF.predict(X_test)
accScoreRF = accuracy_score(y_test,predictionRF)
#f1ScoreRF = f1_score(y_test,predictionRF)

print accScoreRF, #f1ScoreRF
0.709677419355
In [13]:
businessModelRF = pd.DataFrame()
businessModelRF['Location'] = np.sort(dfLocation.unique())
businessModelRF['Predicted'] = pd.Series(modelRF.predict(df))
predictedProbasRF = pd.DataFrame(modelRF.predict_proba(df))
businessModelRF['0 Probability'] = predictedProbasRF[:][0]
businessModelRF['1 Probability'] = predictedProbasRF[:][1]
businessModelRF['2 Probability'] = predictedProbasRF[:][2]

businessModelRF.head()
Out[13]:
Location Predicted 0 Probability 1 Probability 2 Probability
0 1 1.0 0.00 0.90 0.10
1 10 0.0 0.85 0.15 0.00
2 100 0.0 1.00 0.00 0.00
3 1000 0.0 0.70 0.15 0.15
4 1002 0.0 0.85 0.15 0.00
In [14]:
sortedBusinessModelRF = businessModelRF.sort_values(by='Predicted',ascending=False)
sortedBusinessModelRF.head()
Out[14]:
Location Predicted 0 Probability 1 Probability 2 Probability
630 670 2.0 0.30 0.15 0.55
605 643 2.0 0.05 0.25 0.70
596 632 2.0 0.30 0.05 0.65
865 926 2.0 0.05 0.25 0.70
696 745 2.0 0.10 0.30 0.60

Gradient Boosting

In [15]:
from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import grid_search as GS

modelGB = GBC(n_estimators = 10)
modelGB = modelGB.fit(X_train,y_train)
print modelGB.score(X_test,y_test)
predictionGB = modelGB.predict(X_test)
accScoreGB = accuracy_score(y_test,predictionGB)
#f1ScoreGb = f1_score(y_test,predictionGB)

modelCVGB = GS.GridSearchCV(modelGB,{'n_estimators':[5]})
modelCVGB = modelCVGB.fit(X_train,y_train)
predictionGSGB = modelCVGB.predict(X_test)
accScoreGSGB = accuracy_score(y_test,predictionGSGB)

print accScoreGB, accScoreGSGB #f1ScoreGb
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)
0.704301075269
0.704301075269 0.693548387097
In [16]:
businessModelGB = pd.DataFrame()
businessModelGB['Location'] = np.sort(dfLocation.unique())
businessModelGB['Predicted'] = pd.Series(modelGB.predict(df))
predictedProbasGB = pd.DataFrame(modelGB.predict_proba(df))
businessModelGB['0 Probability'] = predictedProbasGB[:][0]
businessModelGB['1 Probability'] = predictedProbasGB[:][1]
businessModelGB['2 Probability'] = predictedProbasGB[:][2]

businessModelGB.head()
Out[16]:
Location Predicted 0 Probability 1 Probability 2 Probability
0 1 1.0 0.225045 0.656194 0.118760
1 10 0.0 0.557079 0.319453 0.123468
2 100 0.0 0.693160 0.213448 0.093391
3 1000 0.0 0.576869 0.305180 0.117951
4 1002 0.0 0.625789 0.269896 0.104314
In [17]:
sortedBusinessModelGB = businessModelGB.sort_values(by='Predicted',ascending=False)
sortedBusinessModelGB.head()
Out[17]:
Location Predicted 0 Probability 1 Probability 2 Probability
801 86 2.0 0.384476 0.220475 0.395049
577 610 2.0 0.359280 0.221492 0.419228
593 628 2.0 0.378041 0.215007 0.406952
628 668 2.0 0.247362 0.141848 0.610791
892 956 2.0 0.359280 0.221492 0.419228

Logistic Regression

In [18]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
logmodel = lr.fit(X_train,y_train)
predictionLR = logmodel.predict(X_test)
accScoreLR = accuracy_score(y_test,predictionLR)
#f1ScoreLR = f1_score(y_test,predictionLR)

print accScoreLR, #f1ScoreLR
0.725806451613
In [19]:
businessModelLR = pd.DataFrame()
businessModelLR['Location'] = np.sort(dfLocation.unique())
businessModelLR['Predicted'] = pd.Series(logmodel.predict(df))
predictedProbasLR = pd.DataFrame(logmodel.predict_proba(df))
businessModelLR['0 Probability'] = predictedProbasLR[:][0]
businessModelLR['1 Probability'] = predictedProbasLR[:][1]
businessModelLR['2 Probability'] = predictedProbasLR[:][2]

businessModelLR.head()
Out[19]:
Location Predicted 0 Probability 1 Probability 2 Probability
0 1 1.0 0.035979 0.917412 0.046609
1 10 0.0 0.608277 0.365805 0.025918
2 100 0.0 0.888145 0.111467 0.000388
3 1000 0.0 0.648262 0.290779 0.060959
4 1002 0.0 0.715898 0.257207 0.026895
In [20]:
sortedBusinessModelLR = businessModelLR.sort_values(by='Predicted',ascending=False)
sortedBusinessModelLR.head()
Out[20]:
Location Predicted 0 Probability 1 Probability 2 Probability
538 559 2.0 0.197968 0.169599 0.632434
604 642 2.0 0.129807 0.328021 0.542172
21 1020 2.0 0.050065 0.312304 0.637631
22 1021 2.0 0.073759 0.364305 0.561935
804 864 2.0 0.178906 0.111766 0.709328
In [ ]:

In [ ]:

rss facebook twitter github youtube mail spotify lastfm instagram linkedin google google-plus pinterest medium vimeo stackoverflow reddit quora quora