"""Build a classification model that predicts the fault severity of devices,
so that devices needing immediate maintenance can be identified.

Loads the raw CSV tables (all share an 'id' column, on which they are merged
later in the script).
"""
import numpy as np
import pandas as pd
# train_test_split lived in sklearn.cross_validation, which was removed in
# scikit-learn 0.20; it is in sklearn.model_selection since 0.18.
from sklearn.model_selection import train_test_split as tts

# Raw input tables, each keyed by 'id'.
dfEvent = pd.read_csv('event_type.csv')
dfLog = pd.read_csv('log_feature.csv')
dfResource = pd.read_csv('resource_type.csv')
dfSeverity = pd.read_csv('severity_type.csv')
dfTrain = pd.read_csv('train.csv')
# Merge the DataFrames together into one DataFrame.
# pd.merge joins two frames on matching values in the shared 'id' column
# (inner join by default, so only ids present in both sides survive).
print(dfTrain.nunique())
df1 = pd.merge(dfLog, dfEvent, on='id')
df2 = pd.merge(df1, dfResource, on='id')
df3 = pd.merge(df2, dfSeverity, on='id')
df = pd.merge(dfTrain, df3, on='id')
# NOTE(review): bare .head() only displays in a notebook; it is a no-op in a
# plain script. Kept for parity with the original notebook cell.
df.head()
# Strip the textual prefixes so the feature values are just the numeric codes
# (e.g. 'location 123' -> '123'). Values remain *strings* after this step;
# they are one-hot encoded later rather than used as numbers.
df.location = df.location.str.replace('location ', '')
df['log_feature'] = df.log_feature.str.replace('feature ', '')
df['event_type'] = df.event_type.str.replace('event_type ', '')
df['resource_type'] = df.resource_type.str.replace('resource_type ', '')
df['severity_type'] = df.severity_type.str.replace('severity_type ', '')
print(df['location'].nunique())
# Notebook display cell; no effect in a plain script.
df.head(10)
# Keep a copy of the location column: get_dummies below consumes/encodes it,
# but the raw values are needed again to aggregate rows per location.
dfLocation = df.location
# Target frame: location plus fault_severity (selection copies, so the
# inplace drop on df below does not touch y).
y = df[['location', 'fault_severity']]
df.drop(['fault_severity', 'id'], axis=1, inplace=True)
# Notebook display cell; no effect in a plain script.
df.head()
# One-hot encode every categorical feature column.
df = pd.get_dummies(df, columns=['location', 'log_feature', 'event_type',
                                 'resource_type', 'severity_type'])
df['location'] = dfLocation
# Collapse to one row per location: dummy indicators are summed into counts.
df = df.groupby('location').sum()
# One label per location: the rounded mean fault severity at that location.
y = y.groupby('location').mean().round()
print(df.head())
print(y.head(20))
print(len(df.columns))
print(len(df))
# Binarize the summed dummy counts: any positive count becomes 1.
# Vectorized replacement for the original per-cell .iloc double loop,
# which was O(rows * cols) Python-level work.
# NOTE(review): the original loop started at column index 1, leaving
# column 0 untouched — that skip is preserved here; confirm it is intended.
df.iloc[:, 1:] = (df.iloc[:, 1:] > 0).astype(int)
# Notebook display cell; no effect in a plain script.
df.head(10)
# Hold out 20% of the locations for evaluation.
X_train, X_test, y_train, y_test = tts(df, y.fault_severity, train_size=0.8)

from sklearn.ensemble import RandomForestClassifier as RFC
from sklearn.metrics import accuracy_score, f1_score

# --- Random forest baseline ---
modelRF = RFC(n_estimators=20)
modelRF = modelRF.fit(X_train, y_train)
predictionRF = modelRF.predict(X_test)
accScoreRF = accuracy_score(y_test, predictionRF)
# f1_score on a multiclass target needs an averaging strategy, e.g.
# f1_score(y_test, predictionRF, average='macro'); left disabled as original.
print(accScoreRF)

# Per-location business view: predicted severity plus per-class probabilities.
# NOTE(review): this relies on df's groupby index order matching
# np.sort(dfLocation.unique()) — both sort the string codes lexicographically,
# but verify if either side ever becomes numeric.
businessModelRF = pd.DataFrame()
businessModelRF['Location'] = np.sort(dfLocation.unique())
businessModelRF['Predicted'] = pd.Series(modelRF.predict(df))
predictedProbasRF = pd.DataFrame(modelRF.predict_proba(df))
# Column k of predict_proba is the probability of class k.
businessModelRF['0 Probability'] = predictedProbasRF[0]
businessModelRF['1 Probability'] = predictedProbasRF[1]
businessModelRF['2 Probability'] = predictedProbasRF[2]
businessModelRF.head()
# Highest predicted severity first: those locations need maintenance soonest.
sortedBusinessModelRF = businessModelRF.sort_values(by='Predicted', ascending=False)
sortedBusinessModelRF.head()
from sklearn.ensemble import GradientBoostingClassifier as GBC
# sklearn.grid_search was removed in scikit-learn 0.20; GridSearchCV lives
# in sklearn.model_selection since 0.18.
from sklearn.model_selection import GridSearchCV

# --- Gradient boosting model ---
modelGB = GBC(n_estimators=10)
modelGB = modelGB.fit(X_train, y_train)
print(modelGB.score(X_test, y_test))
predictionGB = modelGB.predict(X_test)
accScoreGB = accuracy_score(y_test, predictionGB)
# f1_score on a multiclass target needs average=, e.g. average='macro'.

# Tiny grid search (single candidate, kept from the original notebook).
modelCVGB = GridSearchCV(modelGB, {'n_estimators': [5]})
modelCVGB = modelCVGB.fit(X_train, y_train)
predictionGSGB = modelCVGB.predict(X_test)
accScoreGSGB = accuracy_score(y_test, predictionGSGB)
print(accScoreGB, accScoreGSGB)

# Per-location business view for the gradient boosting model.
businessModelGB = pd.DataFrame()
businessModelGB['Location'] = np.sort(dfLocation.unique())
businessModelGB['Predicted'] = pd.Series(modelGB.predict(df))
predictedProbasGB = pd.DataFrame(modelGB.predict_proba(df))
# Column k of predict_proba is the probability of class k.
businessModelGB['0 Probability'] = predictedProbasGB[0]
businessModelGB['1 Probability'] = predictedProbasGB[1]
businessModelGB['2 Probability'] = predictedProbasGB[2]
businessModelGB.head()
# Highest predicted severity first.
sortedBusinessModelGB = businessModelGB.sort_values(by='Predicted', ascending=False)
sortedBusinessModelGB.head()
from sklearn.linear_model import LogisticRegression

# --- Logistic regression model ---
lr = LogisticRegression()
logmodel = lr.fit(X_train, y_train)
predictionLR = logmodel.predict(X_test)
accScoreLR = accuracy_score(y_test, predictionLR)
# f1_score on a multiclass target needs average=, e.g. average='macro'.
print(accScoreLR)

# Per-location business view for the logistic regression model.
businessModelLR = pd.DataFrame()
businessModelLR['Location'] = np.sort(dfLocation.unique())
businessModelLR['Predicted'] = pd.Series(logmodel.predict(df))
predictedProbasLR = pd.DataFrame(logmodel.predict_proba(df))
# Column k of predict_proba is the probability of class k.
businessModelLR['0 Probability'] = predictedProbasLR[0]
businessModelLR['1 Probability'] = predictedProbasLR[1]
businessModelLR['2 Probability'] = predictedProbasLR[2]
businessModelLR.head()
# Highest predicted severity first.
sortedBusinessModelLR = businessModelLR.sort_values(by='Predicted', ascending=False)
sortedBusinessModelLR.head()