Types of algorithms and when to use them:
Continuous variables -> regression analysis. Categorical variables/data -> classification algorithms or clustering algorithms.
Algorithm = Input (Independent Variables) -> Model -> Output
How to Build a Model: 1) Get the data set 2) Clean the data 3) Split the data 4) Train the model 5) Iterate until the model is optimized 6) Test the model 7) Make predictions on new data with the optimized model
To test your model, use most of the data (about 80%) to train it and hold out the remaining portion (about 20%) to test its accuracy.
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in newer versions
y = np.arange(0,5)
x = np.arange(0,10).reshape(5,2)
# 60% training data, 40% testing data
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.40)
print(x_train)
print(x_test)
print(y_train)
print(y_test)
# Same 60/40 split, but with a fixed seed
# random_state fixes the shuffle seed, so the exact same train/test split is reproduced on every run (useful for later validation)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size = 0.40, random_state = 0)
print(x_train)
print(x_test)
print(y_train)
print(y_test)
Cross Validation - Train and evaluate the model on several different splits (folds) of the data, then average the scores across the folds
from sklearn.model_selection import KFold  # modern replacement for sklearn.cross_validation
x = np.arange(16).reshape(8,2)
y = np.arange(8)
print(x)
print(y)
kf = KFold(n_splits=4)
print(kf.get_n_splits(x))
print(len(x))
for train_index, test_index in kf.split(x):
    print('TRAIN:', train_index, 'TEST:', test_index)
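As a sketch of the same idea, scikit-learn can also run the fit/score loop for you and average the folds; cross_val_score below uses a LinearRegression on the toy arrays above purely for illustration.
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
# Fit and score the model on each of the 4 folds (default scoring for regressors is R^2)
scores = cross_val_score(LinearRegression(), x, y, cv=4)
print(scores)
print(scores.mean())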
Most common Regression Techniques: 1) Multiple Regression 2) Lasso Regression (L1 penalty, which can shrink coefficients all the way to zero) 3) Ridge Regression (L2 penalty, which shrinks coefficients toward zero)
Linear Regression: r = Pearson correlation coefficient between the two variables (value from -1 to 1, strong negative to strong positive correlation). R^2 = coefficient of determination, how close the data points are to the least-squares line (value from 0 to 1, not close to very close). For simple linear regression, R^2 equals r squared.
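A quick numeric sketch of the distinction, using made-up data: squaring r gives the same number as R^2 for simple linear regression.
import numpy as np
from scipy import stats
from sklearn.metrics import r2_score
# Toy data, invented for illustration
x = np.array([1., 2., 3., 4., 5.])
y = np.array([2.1, 3.9, 6.2, 8.1, 9.8])
r, _ = stats.pearsonr(x, y)                  # r ranges from -1 to 1
fit = stats.linregress(x, y)
y_hat = fit.slope*x + fit.intercept
print(r**2, r2_score(y, y_hat))              # the two values match for simple linear regression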
%matplotlib inline
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
df = pd.read_csv('heightweight.csv')
df.head(10)
df.info()
df.describe()
df.isnull().sum()
df.sex.nunique()
df.sex.value_counts()
# Creating dummy variables for the column 'Sex' because 'Sex' is an object
df_dummies = pd.get_dummies(df,drop_first=True)
# Only creates dummy variables for 'Object' Class
# drop_first = True drops one column of dummy variables because the other column is implied
df_dummies.head()
df_dummies.corr()
# Gives you the correlation between the different variables
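Since seaborn is already imported, one optional sketch for reading the correlation matrix at a glance as a heatmap:
sns.heatmap(df_dummies.corr(), annot=True)
plt.show()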
# Histogram of heights (30 bins)
plt.hist(df.heightIn,30)
# Scatter plot of weight vs. height
plt.scatter(df.weightLb,df.heightIn)
# Inspect the rows with weight above 171 lb (potential outliers)
df[df['weightLb']>171]
from scipy import stats
estheight = stats.linregress(df.weightLb,df.heightIn)
estheight
def predict(x):
    return estheight.slope*x + estheight.intercept
# Creating a new column called 'Predicted Height' from the least-squares line
df['Predicted Height'] = predict(df.weightLb)
df['Predicted Height']
df.head()
df['Height Error'] = abs(df['heightIn'] - df['Predicted Height'])
df.head()
from sklearn.metrics import mean_squared_error, r2_score
(mean_squared_error(df.heightIn,df['Predicted Height']))**0.5  # RMSE, in inches
r2_score(df.heightIn,df['Predicted Height'])
df = pd.read_excel('cars.xls')
df.head()
df.info()
df.nunique()
df.describe()
df.corr()
# Model inputs (independent variables)
x = df[['Mileage','Cylinder','Liter','Cruise']]
# Target to predict: price
y = df[['Price']]
print(x.head())
y.head()
import statsmodels.api as sm
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
# Add a constant column so the OLS model fits a y-intercept
x1 = sm.add_constant(x)
x1.head()
# statsmodels Ordinary Least Squares: define the model, then fit it
est = sm.OLS(y,x1)
est_price = est.fit()
est_price.summary()
# Use the built-in predict function in statsmodels to get the predicted prices
y_pred = est_price.predict(x1)
type(y_pred)
x1['Predicted Price'] = y_pred
x1.head()
from sklearn.metrics import mean_squared_error, r2_score
(mean_squared_error(df['Price'],x1['Predicted Price']))**0.5
r2_score(df['Price'],x1['Predicted Price'])
# Drop 'Liter' and refit; without refitting, the predictions (and therefore
# the error metrics) would not change
x2 = x1.drop(['Liter','Predicted Price'],axis=1)
x2.head()
est2 = sm.OLS(y,x2).fit()
y_pred2 = est2.predict(x2)
print(mean_squared_error(df['Price'],y_pred2)**0.5)
r2_score(df['Price'],y_pred2)
from sklearn.model_selection import train_test_split
# Drop the predicted column so it does not leak into the model's features
X_train, X_test, y_train, y_test = train_test_split(x1.drop('Predicted Price',axis=1),y,train_size=0.8)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
Steps: 1) Define the Algorithm 2) Fit the Algorithm 3) Predict from the Algorithm
from sklearn import linear_model
# Linear Regression
reg = linear_model.LinearRegression()
regmodel = reg.fit(X_train,y_train)
y_predtest = regmodel.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_test, y_predtest)**0.5  # RMSE
r2_score(y_test,y_predtest)
print(regmodel.intercept_)
print(regmodel.coef_)
from sklearn.linear_model import Ridge,Lasso
# Create Model using Ridge Regression
ridgereg = Ridge()
ridgereg.fit(X_train,y_train)
# Get Predicted Values using Ridge Regression Model
y_pred_ridge = ridgereg.predict(X_test)
# Test Predicted Values using Ridge Regression Model
print(mean_squared_error(y_test,y_pred_ridge)**0.5)
r2_score(y_test,y_pred_ridge)
# Create Model using Lasso Regression
lassoreg = Lasso()
lassoreg.fit(X_train,y_train)
# Get Predicted Values using Lasso Regression Model
y_pred_lasso = lassoreg.predict(X_test)
# Test Predicted Values using Lasso Regression Model
print(mean_squared_error(y_test,y_pred_lasso)**0.5)
r2_score(y_test,y_pred_lasso)
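Both Ridge() and Lasso() above use the default regularization strength alpha=1.0. A sketch of letting scikit-learn pick alpha by cross-validation instead; the alpha grid here is an arbitrary example, not a recommendation.
from sklearn.linear_model import RidgeCV, LassoCV
# Try a few candidate alphas and keep the one that cross-validates best
ridgecv = RidgeCV(alphas=[0.1, 1.0, 10.0]).fit(X_train, y_train)
print(ridgecv.alpha_)
lassocv = LassoCV(alphas=[0.1, 1.0, 10.0]).fit(X_train, np.ravel(y_train))  # LassoCV expects a 1-D target
print(lassocv.alpha_)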
# Correlation Matrix
# Select Best Variables
# Remove Text Variables
# Shooting for an error of less than $100k
df = pd.read_csv('kc_house_data.csv')
df.head()
df['age'] = 2017 - df['yr_built']
df['renovated?'] = df['yr_renovated'].apply(lambda i: 1 if i>0 else 0)
df['basement?'] = df['sqft_basement'].apply(lambda i: 1 if i>0 else 0)
df['greaterThan15?'] = df['sqft_living'] - df['sqft_living15']
# Note: linearly rescaling one feature only rescales its coefficient;
# the OLS fit quality is unchanged
df['bathrooms'] = df['bathrooms']*1000
x = pd.get_dummies(df,columns=['zipcode'],drop_first=True)
# x = pd.get_dummies(df,columns=['lat'],drop_first=True)
# 'lat' and 'long' are continuous, so get_dummies would create one column per
# unique value (thousands of columns), which is why the output stalls
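One workaround to sketch: bin the continuous coordinates with pd.cut first, so get_dummies only sees a handful of categories instead of thousands of unique floats. The x_binned name and the choice of 10 bins are just for illustration.
df['lat_bin'] = pd.cut(df['lat'], bins=10)    # 10 bins is an arbitrary choice
df['long_bin'] = pd.cut(df['long'], bins=10)
x_binned = pd.get_dummies(df, columns=['zipcode','lat_bin','long_bin'], drop_first=True)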
# Eliminating Outlier Housing Prices
high = np.percentile(x.price,90)
low = np.percentile(x.price,10)
x_no_outs = x[(x.price < high) & (x.price > low)]
x_no_outs.head()
x_no_outs.info()
x_no_outs.corr()
# Target to predict: price
y = x_no_outs[['price']]
# Model inputs: drop the id/date columns, the target, and 'age'
x_no_outs.drop(['date','id','price','age'],axis=1,inplace=True)
# Other candidate columns to drop: 'yr_built','condition','waterfront'
x_no_outs = sm.add_constant(x_no_outs)
x_no_outs.head()
# statsmodels Ordinary Least Squares: define the model, then fit it
est = sm.OLS(y,x_no_outs)
est_price = est.fit()
est_price.summary()
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(x_no_outs,y,train_size=0.8)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
from sklearn import linear_model
# Linear Regression
reg = linear_model.LinearRegression()
regmodel = reg.fit(X_train,y_train)
y_predtest = regmodel.predict(X_test)
from sklearn.metrics import mean_squared_error, r2_score
mean_squared_error(y_test, y_predtest)**0.5
r2_score(y_test,y_predtest)
from sklearn.linear_model import Ridge,Lasso
# Create Model using Ridge Regression
ridgereg = Ridge()
ridgereg.fit(X_train,y_train)
# Get Predicted Values using Ridge Regression Model
y_pred_ridge = ridgereg.predict(X_test)
# Test Predicted Values using Ridge Regression Model
print(mean_squared_error(y_test,y_pred_ridge)**0.5)
r2_score(y_test,y_pred_ridge)
# Create Model using Lasso Regression
lassoreg = Lasso()
lassoreg.fit(X_train,y_train)
# Get Predicted Values using Lasso Regression Model
y_pred_lasso = lassoreg.predict(X_test)
# Test Predicted Values using Lasso Regression Model
print(mean_squared_error(y_test,y_pred_lasso)**0.5)
r2_score(y_test,y_pred_lasso)
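To wrap up, a compact sketch that refits all three regressors on the same housing split and prints their test RMSE and R^2 side by side:
# Compare Linear, Ridge, and Lasso on the identical train/test split
for name, model in [('Linear', linear_model.LinearRegression()), ('Ridge', Ridge()), ('Lasso', Lasso())]:
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print(name, mean_squared_error(y_test, pred)**0.5, r2_score(y_test, pred))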