Categorical Variable Algorithms - Class Notes

Day #5 - Classification Algorithms

Clustering (Unsupervised Learning)

K - Means Algorithm

k = number of clusters. The algorithm places k centroids, assigns each data point to its nearest centroid, then moves each centroid to the mean of the points assigned to it, and repeats until the centroids stop moving (or the point assignments stop changing between groups).
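
As a rough sketch of that loop (not scikit-learn's implementation, which uses the smarter k-means++ seeding and handles edge cases like empty clusters):

import numpy as np

def kmeans_sketch(X, k, iters=100):
    # start from k randomly chosen data points as the initial centroids
    centroids = X[np.random.choice(len(X), k, replace=False)]
    for _ in range(iters):
        # assignment step: label each point with its nearest centroid
        dists = np.linalg.norm(X[:, None, :] - centroids[None, :, :], axis=2)
        labels = dists.argmin(axis=1)
        # update step: move each centroid to the mean of its assigned points
        new_centroids = np.array([X[labels == j].mean(axis=0) for j in range(k)])
        if np.allclose(new_centroids, centroids):  # converged: nothing moved
            break
        centroids = new_centroids
    return labels, centroids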

In [1]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

%matplotlib inline
In [2]:
data = pd.read_excel('cars.xls')
data.head()
Out[2]:
Price Mileage Make Model Trim Type Cylinder Liter Doors Cruise Sound Leather
0 17314.103129 8221 Buick Century Sedan 4D Sedan 6 3.1 4 1 1 1
1 17542.036083 9135 Buick Century Sedan 4D Sedan 6 3.1 4 1 1 0
2 16218.847862 13196 Buick Century Sedan 4D Sedan 6 3.1 4 1 1 0
3 16336.913140 16342 Buick Century Sedan 4D Sedan 6 3.1 4 1 0 0
4 16339.170324 19832 Buick Century Sedan 4D Sedan 6 3.1 4 1 0 1
In [3]:
x = data[['Price','Mileage','Cylinder']]
x.head()
Out[3]:
Price Mileage Cylinder
0 17314.103129 8221 6
1 17542.036083 9135 6
2 16218.847862 13196 6
3 16336.913140 16342 6
4 16339.170324 19832 6
In [4]:
model = KMeans(n_clusters = 5)
model
Out[4]:
KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=5, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)
In [5]:
model = model.fit(x)
In [6]:
pred = model.predict(x)
pred
Out[6]:
array([0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1,
       1, 0, 0, 0, 0, 4, 4, 4, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4,
       4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0,
       0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3,
       4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 2, 2, 2, 2, 1, 1,
       1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 4, 2, 1, 1, 1, 1, 1, 0,
       0, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1,
       0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 2, 2,
       2, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2,
       2, 2, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0,
       0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1, 0, 4, 4, 4,
       4, 4, 4, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4,
       4, 1, 1, 3, 3, 3, 4, 4, 4, 4, 4, 4, 1, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1,
       3, 3, 3, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4,
       4, 4, 4, 4, 4, 1, 1, 0, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 1, 0, 4, 2, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1])
In [7]:
x = x.copy()  # work on a real copy so the assignment below does not trigger SettingWithCopyWarning
x['new cluster'] = pred
x.head()
Out[7]:
Price Mileage Cylinder new cluster
0 17314.103129 8221 6 0
1 17542.036083 9135 6 0
2 16218.847862 13196 6 0
3 16336.913140 16342 6 2
4 16339.170324 19832 6 2

The silhouette value measures how similar an object is to its own cluster (cohesion) compared to other clusters (separation). It ranges from -1 to 1; a high value indicates that the object is well matched to its own cluster and poorly matched to neighboring clusters. If most objects have a high value, the clustering configuration is appropriate; if many points have a low or negative value, then the configuration may have too many or too few clusters. The silhouette coefficient for a sample is (b - a) / max(a, b), where a is the mean distance to the other points in its own cluster and b is the mean distance to the points in the nearest neighboring cluster.
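
As a sanity check of that formula, here is the silhouette of one sample computed by hand on a tiny made-up 1-D dataset (all four values below are hypothetical):

import numpy as np

# clusters {1.0, 1.5} and {8.0, 8.5}; score the point at 1.0
a = np.mean([abs(1.5 - 1.0)])                  # cohesion: mean distance within its own cluster
b = np.mean([abs(8.0 - 1.0), abs(8.5 - 1.0)])  # separation: mean distance to the nearest other cluster
print (b - a) / max(a, b)                      # ~0.93, close to 1 => well clustered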

In [8]:
# note: x still includes the 'new cluster' column added in the previous cell, which
# leaks the cluster labels into the distance calculation; dropping it first would be cleaner
silhouette_score(x, pred)
Out[8]:
0.3544457891439895
In [9]:
# Visualize the data clusters
plt.figure(figsize=(10,6))
plt.scatter(x.Mileage, x.Price, c=pred)
plt.colorbar()  # the colorbar keys the cluster labels; plt.legend() has no labeled artists to show here
plt.show()
In [10]:
model.cluster_centers_
Out[10]:
array([[  1.84179663e+04,   8.03421642e+03,   5.07462687e+00],
       [  1.63211829e+04,   2.90157322e+04,   5.09289617e+00],
       [  1.58318151e+04,   1.99361786e+04,   4.97142857e+00],
       [  4.34180038e+04,   1.18960317e+04,   7.07936508e+00],
       [  3.15059802e+04,   2.24085486e+04,   5.45833333e+00]])
In [11]:
model.labels_
Out[11]:
array([0, 0, 0, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2,
       1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1,
       1, 0, 0, 0, 0, 4, 4, 4, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 3, 4,
       4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4,
       4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4,
       4, 4, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 0,
       0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 1, 0, 2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0,
       2, 2, 2, 2, 2, 2, 1, 0, 0, 2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2,
       1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 2, 1, 1,
       1, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0,
       2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1, 3, 3, 3, 3, 3,
       4, 4, 4, 4, 4, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 0, 0, 2, 2, 2, 2, 1, 1,
       1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 0, 0, 0, 4, 2, 1, 1, 1, 1, 1, 0,
       0, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 2,
       2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 1, 1, 0, 0, 0, 0, 2, 2, 2, 1, 1, 1,
       0, 0, 0, 2, 2, 2, 2, 2, 1, 1, 0, 0, 2, 2, 2, 1, 1, 1, 1, 1, 0, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0, 0, 0, 0, 2, 2,
       2, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 1, 1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 2, 2, 2, 1, 1, 1, 1, 0, 0, 2, 2, 2,
       2, 2, 1, 1, 1, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1, 2, 2, 2, 2, 2, 2, 2, 2,
       1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 0, 2, 2, 2, 2, 2, 2, 1, 1, 1, 0,
       0, 0, 0, 0, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1, 0, 4, 4, 4,
       4, 4, 4, 1, 1, 1, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 4, 4, 4, 4, 4,
       4, 1, 1, 3, 3, 3, 4, 4, 4, 4, 4, 4, 1, 3, 3, 4, 4, 4, 4, 4, 4, 4, 1,
       3, 3, 3, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 4, 4,
       4, 4, 4, 4, 4, 1, 1, 0, 4, 4, 4, 4, 4, 4, 4, 4, 1, 4, 4, 4, 4, 4, 4,
       4, 4, 4, 1, 0, 4, 2, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 1, 1])
In [12]:
# Loop over a range of cluster counts to see which number of clusters gives the best
# silhouette score, and therefore the best model

silScores = []

def clust(clusters):
    bestClustNum = 0
    silScore = 0
    for num_of_cluster in range(2, clusters + 1):
        model = KMeans(n_clusters=num_of_cluster)
        model = model.fit(x)
        pred = model.predict(x)
        score = silhouette_score(x, pred)
        print 'Number of Clusters {}, silhouette score {}'.format(num_of_cluster, score)

        if score > silScore:
            silScore = score
            bestClustNum = num_of_cluster

        silScores.append(score)

    print 'The best number of Clusters is {} with a silhouette score of {}'.format(bestClustNum, silScore)

clust(20)
silScores
Number of Clusters 2, silhouette score 0.451145420319
Number of Clusters 3, silhouette score 0.413210865832
Number of Clusters 4, silhouette score 0.413286093659
Number of Clusters 5, silhouette score 0.355218347953
Number of Clusters 6, silhouette score 0.363436162479
Number of Clusters 7, silhouette score 0.357516123854
Number of Clusters 8, silhouette score 0.368463252368
Number of Clusters 9, silhouette score 0.349115959965
Number of Clusters 10, silhouette score 0.355018959202
Number of Clusters 11, silhouette score 0.358482151876
Number of Clusters 12, silhouette score 0.36523064763
Number of Clusters 13, silhouette score 0.353496211736
Number of Clusters 14, silhouette score 0.35858477856
Number of Clusters 15, silhouette score 0.347898689048
Number of Clusters 16, silhouette score 0.356955236674
Number of Clusters 17, silhouette score 0.357704782134
Number of Clusters 18, silhouette score 0.356481500488
Number of Clusters 19, silhouette score 0.352767719712
Number of Clusters 20, silhouette score 0.348674348196
The best number of Clusters is 2 with a silhouette score of 0.451145420319
Out[12]:
[0.4511454203189893,
 0.41321086583208377,
 0.4132860936585307,
 0.35521834795309348,
 0.36343616247896399,
 0.3575161238544769,
 0.36846325236832911,
 0.34911595996509309,
 0.35501895920207749,
 0.35848215187637755,
 0.36523064763013019,
 0.35349621173628853,
 0.35858477855967297,
 0.34789868904830923,
 0.35695523667367018,
 0.35770478213399731,
 0.35648150048841337,
 0.35276771971226706,
 0.3486743481958568]
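
A quick plot of the scores collected above makes the peak at k = 2 easy to see (a sketch, reusing the silScores list from In [12]):

plt.figure(figsize=(10, 6))
plt.plot(np.arange(2, 21), silScores, marker='o')  # one score per k = 2..20
plt.xlabel('Number of clusters (k)')
plt.ylabel('Silhouette score')
plt.show()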

Classification (Supervised Learning)

K-Nearest Neighbors Algorithm

k-nearest neighbors classifies a point by finding the k training points closest to it and taking a majority vote of their labels. It is highly recommended to scale or normalize your data when using k-nearest neighbors so that features with large ranges do not dominate the distance calculation.
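
The idea in a few lines of numpy (a sketch, not the sklearn implementation used below):

import numpy as np
from collections import Counter

def knn_predict(X_train, y_train, query, k=5):
    dists = np.linalg.norm(X_train - query, axis=1)        # distance to every training point
    nearest = np.argsort(dists)[:k]                        # indices of the k closest points
    return Counter(y_train[nearest]).most_common(1)[0][0]  # majority vote of their labels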

In [13]:
data = pd.read_csv('heightweight.csv')
data_weight = data[['weightLb','ageYear']]
data_weight.head()
Out[13]:
weightLb ageYear
0 85.0 11.91667
1 105.0 12.91667
2 108.0 12.75000
3 92.0 13.41667
4 112.5 15.91667
In [14]:
from sklearn.preprocessing import scale
data_scaled = scale(data_weight)
data_scaled
Out[14]:
array([[-0.840635  , -1.16552514],
       [ 0.19031193, -0.51288498],
       [ 0.34495397, -0.62166052],
       [-0.47980357, -0.1865649 ],
       [ 0.57691704,  1.4450355 ],
       [ 0.55114336,  0.35729973],
       [ 0.13876459,  1.11871542],
       [-1.66539255, -1.21991618],
       [-0.35093521, -0.24095593],
       [-1.69116622, -1.32868518],
       [ 0.13876459, -1.38307622],
       [ 0.11299091,  0.73800431],
       [ 1.14393785, -0.40411597],
       [-0.42825623, -0.83920506],
       [-2.61901847, -1.16552514],
       [-0.63444561, -1.05675613],
       [ 0.29340663,  1.4450355 ],
       [-1.17569276, -0.78482056],
       [ 0.7057854 , -0.9479806 ],
       [ 0.65423806,  0.84677985],
       [-0.840635  , -1.27430068],
       [-1.04682439, -1.32868518],
       [-0.91795602, -0.02340486],
       [ 0.55114336,  0.6292353 ],
       [-0.01587745,  1.11871542],
       [ 0.11299091,  0.08536415],
       [-0.40248255,  0.57484427],
       [ 0.55114336,  0.84677985],
       [ 1.99446907,  2.47838025],
       [-0.91795602, -1.0023651 ],
       [-0.58289827,  0.30291522],
       [-0.89218235, -0.1321804 ],
       [ 0.47382234, -0.83920506],
       [-0.27361419, -1.38307622],
       [-0.32516153,  1.17309993],
       [ 1.01506948,  1.77135559],
       [-0.09319847,  0.24852419],
       [ 2.12333744,  0.68361981],
       [ 0.86042744,  1.11871542],
       [ 0.16453826,  0.95555538],
       [ 0.06144357,  0.46607526],
       [-0.60867194,  0.08536415],
       [-0.32516153,  0.19413968],
       [-0.14474582,  0.24852419],
       [-0.37670888, -0.78482056],
       [ 0.34495397,  1.06432439],
       [-1.94890296, -1.38307622],
       [-0.86640868, -0.9479806 ],
       [-0.40248255, -1.11114064],
       [ 0.55114336,  0.68361981],
       [ 2.43262152,  0.73800431],
       [ 0.55114336,  1.77135559],
       [ 0.39650132, -1.0023651 ],
       [-0.50557725, -1.05675613],
       [-1.35610847, -0.9479806 ],
       [-0.89218235, -1.05675613],
       [ 0.29340663, -0.51288498],
       [-0.4540299 ,  0.13975518],
       [ 0.422275  ,  1.00993989],
       [-0.89218235, -1.16552514],
       [ 0.06144357,  1.00993989],
       [ 0.24185928,  1.11871542],
       [-1.25301378, -0.89359609],
       [ 0.52536969, -0.9479806 ],
       [ 0.65423806, -0.56727601],
       [-1.35610847, -0.45850048],
       [-1.43342949, -1.11114064],
       [-0.40248255, -0.56727601],
       [ 0.19031193, -0.67604502],
       [ 0.62846438,  1.4450355 ],
       [ 1.99446907,  1.39064447],
       [-1.25301378, -1.32868518],
       [-0.86640868, -0.89359609],
       [ 0.62846438,  1.33625997],
       [-1.2272401 , -1.16552514],
       [ 0.83465377,  0.73800431],
       [-0.17051949, -0.02340486],
       [ 0.55114336, -0.40411597],
       [-0.01587745, -0.9479806 ],
       [-0.32516153, -0.89359609],
       [-1.04682439,  0.68361981],
       [-0.53135092,  0.35729973],
       [ 2.09756377,  0.41168423],
       [-0.14474582,  1.39064447],
       [ 0.55114336,  1.00993989],
       [ 0.78310642, -1.16552514],
       [-0.14474582,  0.79239534],
       [-0.91795602,  1.17309993],
       [ 1.63363765,  0.95555538],
       [-0.50557725,  0.95555538],
       [-1.48497684, -1.21991618],
       [ 0.26763295,  0.03097964],
       [-1.76848724,  0.03097964],
       [ 1.0923905 , -0.56727601],
       [-1.40765582, -0.78482056],
       [ 2.22643213, -0.51288498],
       [-0.89218235, -0.07779589],
       [-1.48497684, -1.27430068],
       [-1.92312928, -0.9479806 ],
       [ 0.75733275,  2.47838025],
       [-0.89218235,  0.35729973],
       [-0.40248255,  0.13975518],
       [ 0.52536969,  0.95555538],
       [-0.47980357, -1.11114064],
       [ 0.7057854 ,  1.55380451],
       [-0.840635  , -1.27430068],
       [ 0.34495397, -0.02340486],
       [ 0.34495397,  1.17309993],
       [-0.840635  ,  0.24852419],
       [-0.78908765,  0.57484427],
       [ 0.47382234,  0.84677985],
       [-0.17051949,  0.03097964],
       [ 0.19031193, -0.40411597],
       [-1.27878745, -1.11114064],
       [-0.89218235, -0.78482056],
       [ 1.37590091, -0.78482056],
       [-0.73754031, -1.38307622],
       [ 1.37590091,  1.33625997],
       [ 0.49959602,  1.00993989],
       [-1.14991908, -0.9479806 ],
       [-0.58289827, -1.0023651 ],
       [-0.89218235, -0.24095593],
       [ 0.55114336, -0.45850048],
       [-0.42825623,  0.46607526],
       [ 0.8088801 , -0.73043605],
       [-0.89218235, -1.27430068],
       [-0.09319847, -0.78482056],
       [-0.32516153, -0.02340486],
       [-0.89218235, -0.62166052],
       [ 1.68518499,  2.26083571],
       [ 3.6182105 ,  4.65384529],
       [-0.14474582,  0.6292353 ],
       [ 0.88620112,  0.6292353 ],
       [-0.35093521, -1.32868518],
       [ 0.19031193,  1.11871542],
       [ 0.13876459,  0.84677985],
       [-0.9437297 , -1.0023651 ],
       [ 0.21608561,  1.00993989],
       [-0.89218235, -1.32868518],
       [-0.78908765, -0.73043605],
       [-1.04682439, -0.73043605],
       [-0.37670888, -1.11114064],
       [-1.17569276, -0.24095593],
       [ 0.93774846,  0.73800431],
       [ 1.63363765,  1.55380451],
       [ 0.91197479, -0.1321804 ],
       [-0.32516153, -0.02340486],
       [ 0.55114336,  1.17309993],
       [-1.35610847, -1.16552514],
       [-0.47980357,  0.57484427],
       [ 0.55114336,  0.57484427],
       [-0.14474582,  0.57484427],
       [ 0.57691704,  0.46607526],
       [ 0.57691704,  0.30291522],
       [ 0.34495397,  0.52045977],
       [ 0.34495397, -0.02340486],
       [-0.68599296, -1.11114064],
       [ 0.24185928, -0.45850048],
       [-0.47980357, -0.83920506],
       [ 0.83465377, -1.11114064],
       [-0.89218235, -0.9479806 ],
       [ 0.55114336,  1.28187546],
       [-0.0674248 ,  0.24852419],
       [ 0.55114336,  0.41168423],
       [-0.89218235, -0.78482056],
       [ 1.35012724,  1.55380451],
       [-1.07259806, -0.40411597],
       [-0.40248255,  0.19413968],
       [-0.76331398, -1.32868518],
       [-0.4540299 , -0.45850048],
       [ 0.37072765, -0.45850048],
       [ 1.01506948, -0.34972494],
       [ 0.55114336,  1.06432439],
       [ 0.65423806, -0.45850048],
       [-0.89218235, -1.11114064],
       [-1.04682439,  0.6292353 ],
       [ 0.52536969,  0.19413968],
       [-1.04682439, -0.83920506],
       [-1.6138452 , -1.21991618],
       [ 1.99446907,  1.28187546],
       [ 0.8088801 ,  2.09767567],
       [-0.89218235, -1.21991618],
       [ 0.55114336,  1.33625997],
       [ 2.53571621,  1.28187546],
       [ 2.3553005 ,  1.93451563],
       [ 0.19031193, -0.67604502],
       [ 0.93774846,  0.52045977],
       [-0.89218235,  0.08536415],
       [-0.53135092, -1.05675613],
       [-0.01587745, -1.16552514],
       [ 0.83465377, -0.07779589],
       [ 1.01506948,  0.08536415],
       [ 1.63363765,  0.95555538],
       [ 0.55114336,  0.46607526],
       [-0.50557725, -0.51288498],
       [ 0.19031193, -0.1321804 ],
       [ 0.49959602,  0.68361981],
       [ 0.55114336,  0.68361981],
       [ 0.65423806,  0.57484427],
       [-0.53135092,  0.08536415],
       [-0.17051949, -0.78482056],
       [ 0.86042744, -0.78482056],
       [ 0.73155908,  1.28187546],
       [ 0.55114336, -0.07779589],
       [ 0.55114336,  0.35729973],
       [-0.53135092, -0.1321804 ],
       [-0.840635  , -1.27430068],
       [ 0.55114336,  0.52045977],
       [-0.71176663, -1.21991618],
       [ 0.86042744, -0.89359609],
       [-0.91795602, -1.32868518],
       [ 0.75733275, -0.24095593],
       [-0.63444561, -1.11114064],
       [ 3.6182105 ,  2.26083571],
       [ 0.55114336, -0.29534044],
       [-1.51075051, -0.83920506],
       [ 2.50994254,  1.55380451],
       [ 1.71095867,  1.60819555],
       [-0.22206684, -0.67604502],
       [-1.53652418, -1.0023651 ],
       [-1.43342949, -1.38307622],
       [ 0.55114336,  1.17309993],
       [-1.35610847, -0.1865649 ],
       [ 1.37590091, -0.62166052],
       [-0.17051949,  1.71696455],
       [-0.89218235, -0.02340486],
       [-0.11897215, -0.29534044],
       [ 0.55114336,  0.73800431],
       [-1.12414541, -0.62166052],
       [-1.07259806, -0.51288498],
       [ 0.06144357,  0.73800431],
       [-1.30456112, -1.21991618],
       [ 0.55114336, -0.02340486],
       [ 0.65423806,  1.33625997],
       [ 1.99446907, -0.02340486],
       [ 0.3191803 ,  0.13975518],
       [-0.73754031, -0.73043605]])
In [15]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=25)
# note: fitting on the unscaled columns here; a scaled fit is sketched below
model = neigh.fit(data_weight, data.sex)
In [16]:
print "Predicted Gender = ",model.predict([[110,19.0]])
Predicted Gender =  ['f']
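
The fit above used the unscaled columns. A sketch of the scaled version, reusing data_scaled from In [14] and transforming the query point with the same statistics (scale() standardizes with the population std, ddof=0):

means = data_weight.mean().values
stds = data_weight.std(ddof=0).values
neigh_scaled = KNeighborsClassifier(n_neighbors=25).fit(data_scaled, data.sex)
query = (np.array([[110, 19.0]]) - means) / stds
print "Predicted Gender = ", neigh_scaled.predict(query)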

Naive Bayes

"Naive" because we assume that all features are independent of each other given the class. This assumption makes the math easier and the algorithm faster, even though it is generally not true in practice. Top uses for this algorithm are spam filters and document classification.
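
A toy illustration of how the independence assumption gets used (every probability below is a made-up number, not an estimate from data):

# P(class | words) is proportional to P(class) * product of P(word | class)
p_spam, p_free_g_spam, p_viagra_g_spam = 0.4, 0.30, 0.200
p_ham,  p_free_g_ham,  p_viagra_g_ham  = 0.6, 0.02, 0.001

spam_score = p_spam * p_free_g_spam * p_viagra_g_spam  # 0.024
ham_score  = p_ham  * p_free_g_ham  * p_viagra_g_ham   # 0.000012
print 'spam' if spam_score > ham_score else 'ham'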

In [17]:
import os
import io
import numpy as np
from pandas import DataFrame
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
In [20]:
!cd
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms
In [49]:
filenames = []
# os.walk yields one (dirpath, dirnames, filenames) tuple per directory it visits
for files in os.walk('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms'):
    filenames.append(files)
In [50]:
filenames
Out[50]:
[('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms',
  ['.ipynb_checkpoints', 'Emails'],
  ['.DS_Store',
   'cars.xls',
   'Day #5 - Classification Algorithms.ipynb',
   'heightweight.csv',
   'PastHires.csv']),
 ('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\.ipynb_checkpoints',
  [],
  ['Day #5 - Class Notes-checkpoint.ipynb',
   'Day #5 - Classification Algorithms-checkpoint.ipynb',
   'Day #6 - Kaggle Titanic Competition-checkpoint.ipynb']),
 ('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\Emails',
  ['ham', 'spam'],
  ['.DS_Store']),
 ('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\Emails\\ham',
  [],
  ['.DS_Store', 'Ford.txt']),
 ('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\Emails\\spam',
  [],
  ['.DS_Store', 'FreeBeer.txt', 'Viagra.txt'])]
In [51]:
print os.path
<module 'ntpath' from 'C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\ntpath.pyc'>
In [52]:
loca = 'C:\\Users\\nwerner\\AppData\\Local\\Continuum\\Anaconda2\\lib\\ntpath.pyc'
In [53]:
print os.path.split(loca)
print os.path.splitdrive(loca)
print os.path.dirname(loca)
print os.path.basename(loca)
('C:\\Users\\nwerner\\AppData\\Local\\Continuum\\Anaconda2\\lib', 'ntpath.pyc')
('C:', '\\Users\\nwerner\\AppData\\Local\\Continuum\\Anaconda2\\lib\\ntpath.pyc')
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib
ntpath.pyc
In [54]:
def readFiles(path):
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            path = os.path.join(root, filename)

            lines = []
            f = io.open(path, 'r', encoding='latin1')
            for line in f:
                lines.append(line)
            f.close()
            message = '\n'.join(lines)
            yield path, message


def dataFrameFromDirectory(path, classification):
    rows = []
    index = []
    for filename, message in readFiles(path):
        rows.append({'message': message, 'class': classification})
        index.append(filename)

    return DataFrame(rows, index=index)
In [56]:
data = DataFrame({'message':[],'class':[]})
data = data.append(dataFrameFromDirectory('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\Emails\\spam','spam'))
data = data.append(dataFrameFromDirectory('C:\\Users\\nwerner\\DevMasters\\Class Notes\\Day #5 - Classifier Algorithms\\Emails\\ham','ham'))
data
Out[56]:
class message
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms\Emails\spam\.DS_Store spam Bud1%...
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms\Emails\spam\FreeBeer.txt spam Free
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms\Emails\spam\Viagra.txt spam Try Viagra for free now!! spam
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms\Emails\ham\.DS_Store ham Bud1%...
C:\Users\nwerner\DevMasters\Class Notes\Day #5 - Classifier Algorithms\Emails\ham\Ford.txt ham Win a new 2016 Mustang!!! ham
In [57]:
vectorizer = CountVectorizer()

# CountVectorizer tokenizes the messages: it builds a vocabulary of every word
# that occurs and turns each message into a row of word counts

counts = vectorizer.fit_transform(data['message'].values)
In [58]:
print counts
  (0, 2)	1
  (0, 1)	1
  (1, 4)	1
  (2, 9)	1
  (2, 8)	1
  (2, 3)	1
  (2, 11)	1
  (2, 10)	1
  (2, 4)	1
  (3, 2)	1
  (3, 1)	1
  (4, 5)	1
  (4, 6)	1
  (4, 0)	1
  (4, 7)	1
  (4, 12)	1
In [59]:
counts.todense()
Out[59]:
matrix([[0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1]], dtype=int64)
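
To see which word each column corresponds to, the vectorizer keeps its vocabulary (in this sklearn version the call is get_feature_names()):

print vectorizer.get_feature_names()
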
In [60]:
classifier = MultinomialNB()

# targets is the classification of every email you have encountered
targets = data['class'].values

# this will create a model using naive bayes
classifier.fit(counts,targets)
Out[60]:
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
In [61]:
examples = ['Free Weed and Viagra now!!!','Mustang','Free Viagra']
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
predictions
Out[61]:
array(['spam', 'ham', 'spam'],
      dtype='|S4')

Decision Trees

In [29]:
import numpy as np
import pandas as pd
from sklearn import tree

input_file = 'PastHires.csv'
df = pd.read_csv(input_file,header=0)
df.head()
Out[29]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
0 10 Y 4 BS N N Y
1 0 N 0 BS Y Y Y
2 7 N 6 BS N N N
3 2 Y 1 MS Y N Y
4 20 N 2 PhD Y N N
In [30]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13 entries, 0 to 12
Data columns (total 7 columns):
Years Experience      13 non-null int64
Employed?             13 non-null object
Previous employers    13 non-null int64
Level of Education    13 non-null object
Top-tier school       13 non-null object
Interned              13 non-null object
Hired                 13 non-null object
dtypes: int64(2), object(5)
memory usage: 800.0+ bytes
In [31]:
d = {'Y':1,'N':0,'BS':0,'MS':1,'PhD':2}

df['Hired'] = df['Hired'].map(d)
df['Employed?'] = df['Employed?'].map(d)
df['Top-tier school'] = df['Top-tier school'].map(d)
df['Interned'] = df['Interned'].map(d)
df['Level of Education'] = df['Level of Education'].map(d)
df
Out[31]:
Years Experience Employed? Previous employers Level of Education Top-tier school Interned Hired
0 10 1 4 0 0 0 1
1 0 0 0 0 1 1 1
2 7 0 6 0 0 0 0
3 2 1 1 1 1 0 1
4 20 0 2 2 1 0 0
5 0 0 0 2 1 1 1
6 5 1 2 1 0 1 1
7 3 0 1 0 0 1 1
8 15 1 5 0 0 0 1
9 0 0 0 0 0 0 0
10 1 0 1 2 1 0 0
11 4 1 1 0 0 1 1
12 0 0 0 2 1 0 1
In [59]:
features = list(df.columns[:6])
y = df['Hired']
X = df[features]
clf = tree.DecisionTreeClassifier()
clf = clf.fit(X,y)
clf
Out[59]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
In [60]:
print clf.predict([[0, 1, 0, 0, 0, 0]]) # Just need a job to get hired
print clf.predict([[0, 0, 0, 0, 0, 1]]) # Just need to have interned with the company to get hired
[1]
[1]
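
The fitted tree itself can be exported for inspection (a sketch: rendering the .dot file requires Graphviz, and the hires_tree.dot filename is arbitrary):

with open('hires_tree.dot', 'w') as f:
    tree.export_graphviz(clf, out_file=f, feature_names=features)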

Ensemble Learning: Using a Random Forest

In [61]:
from sklearn.ensemble import RandomForestClassifier

# How many trees do you want in your forest?
clf = RandomForestClassifier(n_estimators=20)
clf = clf.fit(X,y)
clf
Out[61]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=20, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
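
The forest can be queried just like the single tree (2-D input); with only 20 trees fit on 13 rows, the majority vote on borderline candidates can vary from run to run:

print clf.predict([[0, 1, 0, 0, 0, 0]])  # currently employed
print clf.predict([[0, 0, 0, 0, 0, 1]])  # interned with the company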

Ensemble Learning: Using Gradient Boosting

In [63]:
from sklearn.ensemble import GradientBoostingClassifier

# n_estimators = the number of boosting stages (trees) to fit sequentially
clf = GradientBoostingClassifier(n_estimators=10)
clf = clf.fit(X,y)
print clf.predict([[10, 1, 4, 0, 0, 0]])
print clf.predict([[10, 0, 4, 2, 0, 0]])
print clf.predict([[0, 1, 0, 0, 0, 0]]) # Just need a job to get hired
print clf.predict([[0, 0, 0, 0, 0, 1]]) # Just need to have interned with the company to get hired
[1]
[0]
[1]
[1]
In [65]:
prediction = clf.predict(X)
prediction
Out[65]:
array([1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1], dtype=int64)
In [66]:
from sklearn.metrics import accuracy_score, f1_score
accuracy_score(y,prediction)
Out[66]:
1.0
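
That 1.0 is accuracy on the same rows the model was trained on, so it says little about generalization. A sketch of a held-out evaluation (with only 13 rows the split is illustrative rather than meaningful):

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)
clf_holdout = GradientBoostingClassifier(n_estimators=10).fit(X_train, y_train)
print accuracy_score(y_test, clf_holdout.predict(X_test))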

Logistic Regression

In [73]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
logmodel = lr.fit(X,y)

logmodel.predict([[10, 0, 3, 1, 1, 1]])
logmodel.predict([[45, 0, 1, 2, 1, 1]])
Out[73]:
array([0], dtype=int64)
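
Logistic regression also exposes the class probabilities behind each prediction:

print logmodel.predict_proba([[45, 0, 1, 2, 1, 1]])  # [[P(not hired), P(hired)]]
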
In [ ]:
