Project - Service Disruptions¶

Purpose/Goal:¶

To use the given data to create a classification algorithm that will accurately predict the fault severity of device so as to determine which devices need immediate maintenance.

import numpy as np
from sklearn.cross_validation import train_test_split as tts
import pandas as pd

C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

dfEvent = pd.read_csv('event_type.csv')
dfLog = pd.read_csv('log_feature.csv')
dfResource = pd.read_csv('resource_type.csv')
dfSeverity = pd.read_csv('severity_type.csv')
dfTrain = pd.read_csv('train.csv')

# Merge the DataFrames together into one DataFrame

# merge allows you to merge data frames based on values in a row or columns
print dfTrain.nunique()

df1 = pd.merge(dfLog,dfEvent,on='id')
df2 = pd.merge(df1,dfResource,on='id')
df3 = pd.merge(df2,dfSeverity,on='id')
df = pd.merge(dfTrain,df3,on='id')
df.head()

id                7381
location           929
fault_severity       3
dtype: int64

# Strip Features So they are just numbers

df.location = df.location.str.replace('location ','')
df['log_feature'] = df.log_feature.str.replace('feature ','')
df['event_type'] = df.event_type.str.replace('event_type ','')
df['resource_type'] = df.resource_type.str.replace('resource_type ','')
df['severity_type'] = df.severity_type.str.replace('severity_type ','')
print df['location'].nunique()
df.head(10)

929

dfLocation = df.location
y = df[['location','fault_severity']]
df.drop(['fault_severity','id'],axis=1,inplace=True)
df.head()

df = pd.get_dummies(df,columns=['location','log_feature','event_type','resource_type','severity_type'])

df['location'] = dfLocation

df = df.groupby('location').sum()
y = y.groupby('location').mean().round()
print df.head()
print y.head(20)

          volume  location_1  location_10  location_100  location_1000  \
location
1            664       163.0          0.0           0.0            0.0
10            20         0.0          2.0           0.0            0.0
100          246         0.0          0.0          96.0            0.0
1000          29         0.0          0.0           0.0           29.0
1002           2         0.0          0.0           0.0            0.0

          location_1002  location_1005  location_1006  location_1007  \
location
1                   0.0            0.0            0.0            0.0
10                  0.0            0.0            0.0            0.0
100                 0.0            0.0            0.0            0.0
1000                0.0            0.0            0.0            0.0
1002                2.0            0.0            0.0            0.0

          location_1008       ...         resource_type_5  resource_type_6  \
location                      ...
1                   0.0       ...                     0.0              0.0
10                  0.0       ...                     0.0              0.0
100                 0.0       ...                     0.0              0.0
1000                0.0       ...                     0.0              0.0
1002                0.0       ...                     0.0              0.0

          resource_type_7  resource_type_8  resource_type_9  severity_type_1  \
location
1                     0.0            142.0              0.0            150.0
10                    0.0              2.0              0.0              2.0
100                   0.0              0.0              0.0             13.0
1000                  0.0             15.0              0.0             26.0
1002                  0.0              0.0              0.0              2.0

          severity_type_2  severity_type_3  severity_type_4  severity_type_5
location
1                    13.0              0.0              0.0              0.0
10                    0.0              0.0              0.0              0.0
100                  78.0              0.0              5.0              0.0
1000                  3.0              0.0              0.0              0.0
1002                  0.0              0.0              0.0              0.0

[5 rows x 1325 columns]
          fault_severity
location
1                    1.0
10                   0.0
100                  0.0
1000                 0.0
1002                 0.0
1005                 0.0
1006                 0.0
1007                 0.0
1008                 1.0
1009                 0.0
101                  0.0
1010                 0.0
1011                 1.0
1013                 0.0
1014                 1.0
1015                 1.0
1016                 1.0
1017                 1.0
1018                 0.0
1019                 1.0

print len(df.columns)
print len(df)

1325
929

for c in np.arange(1,len(df.columns)):
    for r in np.arange(0,len(df)):
        if df.iloc[r,c]>0:
            df.iloc[r,c]=1

df.head(10)

X_train, X_test, y_train, y_test = tts(df,y.fault_severity,train_size=0.8)

Random Forest¶

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score, f1_score

modelRF = RFC(n_estimators=20)
modelRF = modelRF.fit(X_train,y_train)
predictionRF = modelRF.predict(X_test)
accScoreRF = accuracy_score(y_test,predictionRF)
#f1ScoreRF = f1_score(y_test,predictionRF)

print accScoreRF, #f1ScoreRF

0.709677419355

businessModelRF = pd.DataFrame()
businessModelRF['Location'] = np.sort(dfLocation.unique())
businessModelRF['Predicted'] = pd.Series(modelRF.predict(df))
predictedProbasRF = pd.DataFrame(modelRF.predict_proba(df))
businessModelRF['0 Probability'] = predictedProbasRF[:][0]
businessModelRF['1 Probability'] = predictedProbasRF[:][1]
businessModelRF['2 Probability'] = predictedProbasRF[:][2]

businessModelRF.head()

sortedBusinessModelRF = businessModelRF.sort_values(by='Predicted',ascending=False)
sortedBusinessModelRF.head()

Gradient Boosting¶

from sklearn.ensemble import GradientBoostingClassifier as GBC
from sklearn import grid_search as GS

modelGB = GBC(n_estimators = 10)
modelGB = modelGB.fit(X_train,y_train)
print modelGB.score(X_test,y_test)
predictionGB = modelGB.predict(X_test)
accScoreGB = accuracy_score(y_test,predictionGB)
#f1ScoreGb = f1_score(y_test,predictionGB)

modelCVGB = GS.GridSearchCV(modelGB,{'n_estimators':[5]})
modelCVGB = modelCVGB.fit(X_train,y_train)
predictionGSGB = modelCVGB.predict(X_test)
accScoreGSGB = accuracy_score(y_test,predictionGSGB)

print accScoreGB, accScoreGSGB #f1ScoreGb

C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\grid_search.py:42: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. This module will be removed in 0.20.
  DeprecationWarning)

0.704301075269
0.704301075269 0.693548387097

businessModelGB = pd.DataFrame()
businessModelGB['Location'] = np.sort(dfLocation.unique())
businessModelGB['Predicted'] = pd.Series(modelGB.predict(df))
predictedProbasGB = pd.DataFrame(modelGB.predict_proba(df))
businessModelGB['0 Probability'] = predictedProbasGB[:][0]
businessModelGB['1 Probability'] = predictedProbasGB[:][1]
businessModelGB['2 Probability'] = predictedProbasGB[:][2]

businessModelGB.head()

sortedBusinessModelGB = businessModelGB.sort_values(by='Predicted',ascending=False)
sortedBusinessModelGB.head()

Logistic Regression¶

from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
logmodel = lr.fit(X_train,y_train)
predictionLR = logmodel.predict(X_test)
accScoreLR = accuracy_score(y_test,predictionLR)
#f1ScoreLR = f1_score(y_test,predictionLR)

print accScoreLR, #f1ScoreLR

0.725806451613

businessModelLR = pd.DataFrame()
businessModelLR['Location'] = np.sort(dfLocation.unique())
businessModelLR['Predicted'] = pd.Series(logmodel.predict(df))
predictedProbasLR = pd.DataFrame(logmodel.predict_proba(df))
businessModelLR['0 Probability'] = predictedProbasLR[:][0]
businessModelLR['1 Probability'] = predictedProbasLR[:][1]
businessModelLR['2 Probability'] = predictedProbasLR[:][2]

businessModelLR.head()

sortedBusinessModelLR = businessModelLR.sort_values(by='Predicted',ascending=False)
sortedBusinessModelLR.head()

	id	location	fault_severity	log_feature	volume	event_type	resource_type	severity_type
0	14121	location 118	1	feature 312	19	event_type 34	resource_type 2	severity_type 2
1	14121	location 118	1	feature 312	19	event_type 35	resource_type 2	severity_type 2
2	14121	location 118	1	feature 232	19	event_type 34	resource_type 2	severity_type 2
3	14121	location 118	1	feature 232	19	event_type 35	resource_type 2	severity_type 2
4	9320	location 91	0	feature 315	200	event_type 34	resource_type 2	severity_type 2

	id	location	fault_severity	log_feature	volume	event_type	resource_type	severity_type
0	14121	118	1	312	19	34	2	2
1	14121	118	1	312	19	35	2	2
2	14121	118	1	232	19	34	2	2
3	14121	118	1	232	19	35	2	2
4	9320	91	0	315	200	34	2	2
5	9320	91	0	315	200	35	2	2
6	9320	91	0	235	116	34	2	2
7	9320	91	0	235	116	35	2	2
8	14394	152	1	221	1	35	2	2
9	14394	152	1	221	1	34	2	2

	location	log_feature	volume	event_type	resource_type	severity_type
0	118	312	19	34	2	2
1	118	312	19	35	2	2
2	118	232	19	34	2	2
3	118	232	19	35	2	2
4	91	315	200	34	2	2

	Location	Predicted	0 Probability	1 Probability	2 Probability
630	670	2.0	0.30	0.15	0.55
605	643	2.0	0.05	0.25	0.70
596	632	2.0	0.30	0.05	0.65
865	926	2.0	0.05	0.25	0.70
696	745	2.0	0.10	0.30	0.60

	Location	Predicted	0 Probability	1 Probability	2 Probability
0	1	1.0	0.225045	0.656194	0.118760
1	10	0.0	0.557079	0.319453	0.123468
2	100	0.0	0.693160	0.213448	0.093391
3	1000	0.0	0.576869	0.305180	0.117951
4	1002	0.0	0.625789	0.269896	0.104314

	location	log_feature	volume	event_type	resource_type	severity_type
0	118	312	19	34	2	2
1	118	312	19	35	2	2
2	118	232	19	34	2	2
3	118	232	19	35	2	2
4	91	315	200	34	2	2

	Location	Predicted	0 Probability	1 Probability	2 Probability
0	1	1.0	0.00	0.90	0.10
1	10	0.0	0.85	0.15	0.00
2	100	0.0	1.00	0.00	0.00
3	1000	0.0	0.70	0.15	0.15
4	1002	0.0	0.85	0.15	0.00

	Location	Predicted	0 Probability	1 Probability	2 Probability
801	86	2.0	0.384476	0.220475	0.395049
577	610	2.0	0.359280	0.221492	0.419228
593	628	2.0	0.378041	0.215007	0.406952
628	668	2.0	0.247362	0.141848	0.610791
892	956	2.0	0.359280	0.221492	0.419228

	Location	Predicted	0 Probability	1 Probability	2 Probability
0	1	1.0	0.035979	0.917412	0.046609
1	10	0.0	0.608277	0.365805	0.025918
2	100	0.0	0.888145	0.111467	0.000388
3	1000	0.0	0.648262	0.290779	0.060959
4	1002	0.0	0.715898	0.257207	0.026895

	Location	Predicted	0 Probability	1 Probability	2 Probability
538	559	2.0	0.197968	0.169599	0.632434
604	642	2.0	0.129807	0.328021	0.542172
21	1020	2.0	0.050065	0.312304	0.637631
22	1021	2.0	0.073759	0.364305	0.561935
804	864	2.0	0.178906	0.111766	0.709328

	location	log_feature	volume	event_type	resource_type	severity_type
0	118	312	19	34	2	2
1	118	312	19	35	2	2
2	118	232	19	34	2	2
3	118	232	19	35	2	2
4	91	315	200	34	2	2

Service Disruptions Predictions

Project - Service Disruptions¶

Purpose/Goal:¶

Random Forest¶

Gradient Boosting¶

Logistic Regression¶

	location	log_feature	volume	event_type	resource_type	severity_type
0	118	312	19	34	2	2
1	118	312	19	35	2	2
2	118	232	19	34	2	2
3	118	232	19	35	2	2
4	91	315	200	34	2	2