#import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# sklearn.cross_validation is deprecated in favor of sklearn.model_selection
# (available since scikit-learn 0.18) and is scheduled for removal, so prefer
# the new location and only fall back to the old module on older installs.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# Raw peek at the ratings file: fields are tab-delimited ('\t' is the escape
# sequence for a single tab character) and, with header=None, the columns get
# integer labels instead of names.
ratings = pd.read_csv('u.data', sep='\t', header=None)
ratings.head()
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | 0 | 50 | 5 | 881250949 |
| 1 | 0 | 172 | 5 | 881250949 |
| 2 | 0 | 133 | 1 | 881250949 |
| 3 | 196 | 242 | 3 | 881250949 |
| 4 | 186 | 302 | 3 | 891717742 |
# Load the first three tab-separated fields of u.data as
# user_id / movie_id / rating.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, usecols=range(3))
# print(...) with a single argument behaves the same on Python 2 and 3,
# unlike the Python-2-only print statement.
print(ratings.head())

# Load the first two pipe-separated fields of u.item as movie_id / title.
# NOTE(review): with the default decoding, at least one title
# ("... klaka (Cold Fever) (1994)") comes out with replacement characters,
# so the file is presumably Latin-1 encoded - confirm against the dataset
# README before relying on this.
m_cols = ['movie_id', 'title']
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(2),
                     encoding='latin-1')
print(movies.head())
user_id movie_id rating 0 0 50 5 1 0 172 5 2 0 133 1 3 196 242 3 4 186 302 3 movie_id title 0 1 Toy Story (1995) 1 2 GoldenEye (1995) 2 3 Four Rooms (1995) 3 4 Get Shorty (1995) 4 5 Copycat (1995)
# Attach a title to every rating by joining the two frames on their shared
# movie_id column.
df = ratings.merge(movies, on='movie_id')
df.head()
# Debug aid - distinct movie ids after the merge: df['movie_id'].unique()
| user_id | movie_id | rating | title | |
|---|---|---|---|---|
| 0 | 0 | 50 | 5 | Star Wars (1977) |
| 1 | 290 | 50 | 5 | Star Wars (1977) |
| 2 | 79 | 50 | 4 | Star Wars (1977) |
| 3 | 2 | 50 | 5 | Star Wars (1977) |
| 4 | 8 | 50 | 5 | Star Wars (1977) |
# Reshape into a user_id x title matrix of ratings; a user/title pair with no
# rating shows up as NaN.
movieRatings = df.pivot_table(values='rating', index='user_id', columns='title')
movieRatings.head()
| title | 'Til There Was You (1997) | 1-900 (1994) | 101 Dalmatians (1996) | 12 Angry Men (1957) | 187 (1997) | 2 Days in the Valley (1996) | 20,000 Leagues Under the Sea (1954) | 2001: A Space Odyssey (1968) | 3 Ninjas: High Noon At Mega Mountain (1998) | 39 Steps, The (1935) | ... | Yankee Zulu (1994) | Year of the Horse (1997) | You So Crazy (1994) | Young Frankenstein (1974) | Young Guns (1988) | Young Guns II (1990) | Young Poisoner's Handbook, The (1995) | Zeus and Roxanne (1997) | unknown | � k�ldum klaka (Cold Fever) (1994) |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| user_id | |||||||||||||||||||||
| 0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 1 | NaN | NaN | 2.0 | 5.0 | NaN | NaN | 3.0 | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 5.0 | 3.0 | NaN | NaN | NaN | 4.0 | NaN |
| 2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1664 columns
# Pull out the single column of ratings for Star Wars (indexed by user_id);
# users without a Star Wars rating appear as NaN.
starWarsRatings = movieRatings['Star Wars (1977)']
starWarsRatings.head(20)
user_id 0 5.0 1 5.0 2 5.0 3 NaN 4 5.0 5 4.0 6 4.0 7 5.0 8 5.0 9 5.0 10 5.0 11 NaN 12 4.0 13 5.0 14 5.0 15 5.0 16 NaN 17 NaN 18 4.0 19 NaN Name: Star Wars (1977), dtype: float64
# Distribution of Star Wars ratings; dropna=False also counts the users who
# never rated it (NaN).
starWarsRatings.value_counts(dropna=False)
NaN 360 5.0 326 4.0 176 3.0 57 2.0 16 1.0 9 Name: Star Wars (1977), dtype: int64
# 2x2 correlation matrix between the rating columns of two specific titles.
movieRatings[['101 Dalmatians (1996)','Star Wars (1977)']].corr()
| title | 101 Dalmatians (1996) | Star Wars (1977) |
|---|---|---|
| title | ||
| 101 Dalmatians (1996) | 1.000000 | 0.211132 |
| Star Wars (1977) | 0.211132 | 1.000000 |
# corrwith correlates every column of movieRatings (one column per title)
# with the Star Wars ratings Series, returning one coefficient per title.
# Some titles come back as NaN in the result.
movieRatings.corrwith(starWarsRatings)
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\numpy\lib\function_base.py:2995: RuntimeWarning: Degrees of freedom <= 0 for slice c = cov(x, y, rowvar) C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\numpy\lib\function_base.py:2929: RuntimeWarning: divide by zero encountered in double_scalars c *= 1. / np.float64(fact)
title
'Til There Was You (1997) 0.872872
1-900 (1994) -0.645497
101 Dalmatians (1996) 0.211132
12 Angry Men (1957) 0.184289
187 (1997) 0.027398
2 Days in the Valley (1996) 0.066654
20,000 Leagues Under the Sea (1954) 0.289768
2001: A Space Odyssey (1968) 0.230884
3 Ninjas: High Noon At Mega Mountain (1998) NaN
39 Steps, The (1935) 0.106453
8 1/2 (1963) -0.142977
8 Heads in a Duffel Bag (1997) -0.577350
8 Seconds (1994) -0.755929
A Chef in Love (1996) 0.868599
Above the Rim (1994) -0.645497
Absolute Power (1997) 0.085440
Abyss, The (1989) 0.203709
Ace Ventura: Pet Detective (1994) 0.062689
Ace Ventura: When Nature Calls (1995) 0.094514
Across the Sea of Time (1995) -0.132453
Addams Family Values (1993) 0.132264
Addicted to Love (1997) 0.028173
Addiction, The (1995) 0.507093
Adventures of Pinocchio, The (1996) 0.111616
Adventures of Priscilla, Queen of the Desert, The (1994) 0.054740
Adventures of Robin Hood, The (1938) 0.144587
Affair to Remember, An (1957) 0.225451
African Queen, The (1951) 0.230540
Afterglow (1997) 0.420084
Age of Innocence, The (1993) -0.037176
...
Window to Paris (1994) NaN
Wings of Courage (1995) NaN
Wings of Desire (1987) -0.104771
Wings of the Dove, The (1997) 0.000000
Winnie the Pooh and the Blustery Day (1968) 0.142924
Winter Guest, The (1997) 0.353553
Wishmaster (1997) -0.066932
With Honors (1994) -0.111205
Withnail and I (1987) 0.083045
Witness (1985) NaN
Wizard of Oz, The (1939) 0.266335
Wolf (1994) 0.113214
Woman in Question, The (1950) NaN
Women, The (1939) -0.175608
Wonderful, Horrible Life of Leni Riefenstahl, The (1993) 0.707107
Wonderland (1997) NaN
Wooden Man's Bride, The (Wu Kui) (1994) NaN
World of Apu, The (Apur Sansar) (1959) -0.522233
Wrong Trousers, The (1993) 0.216204
Wyatt Earp (1994) 0.059560
Yankee Zulu (1994) NaN
Year of the Horse (1997) -1.000000
You So Crazy (1994) NaN
Young Frankenstein (1974) 0.192589
Young Guns (1988) 0.186377
Young Guns II (1990) 0.228615
Young Poisoner's Handbook, The (1995) -0.007374
Zeus and Roxanne (1997) 0.818182
unknown 0.723123
� k�ldum klaka (Cold Fever) (1994) NaN
Length: 1664, dtype: float64
similarMovies = movieRatings.corrwith(starWarsRatings)
# Discard the titles whose correlation with Star Wars is NaN, keeping only
# the titles that have an actual coefficient.
# (Disabled debug print; note it referred to `similarRatings`, a name that is
# not defined in the visible code.)
similarMovies = similarMovies.dropna()
similarMovies
title
'Til There Was You (1997) 0.872872
1-900 (1994) -0.645497
101 Dalmatians (1996) 0.211132
12 Angry Men (1957) 0.184289
187 (1997) 0.027398
2 Days in the Valley (1996) 0.066654
20,000 Leagues Under the Sea (1954) 0.289768
2001: A Space Odyssey (1968) 0.230884
39 Steps, The (1935) 0.106453
8 1/2 (1963) -0.142977
8 Heads in a Duffel Bag (1997) -0.577350
8 Seconds (1994) -0.755929
A Chef in Love (1996) 0.868599
Above the Rim (1994) -0.645497
Absolute Power (1997) 0.085440
Abyss, The (1989) 0.203709
Ace Ventura: Pet Detective (1994) 0.062689
Ace Ventura: When Nature Calls (1995) 0.094514
Across the Sea of Time (1995) -0.132453
Addams Family Values (1993) 0.132264
Addicted to Love (1997) 0.028173
Addiction, The (1995) 0.507093
Adventures of Pinocchio, The (1996) 0.111616
Adventures of Priscilla, Queen of the Desert, The (1994) 0.054740
Adventures of Robin Hood, The (1938) 0.144587
Affair to Remember, An (1957) 0.225451
African Queen, The (1951) 0.230540
Afterglow (1997) 0.420084
Age of Innocence, The (1993) -0.037176
Air Bud (1997) 0.012557
...
Whole Wide World, The (1996) 0.000000
Widows' Peak (1994) 0.136743
Wild America (1997) 0.040656
Wild Bill (1995) -0.286299
Wild Bunch, The (1969) -0.041299
Wild Reeds (1994) -0.377964
Wild Things (1998) -0.259437
William Shakespeare's Romeo and Juliet (1996) 0.098861
Willy Wonka and the Chocolate Factory (1971) 0.221902
Wings of Desire (1987) -0.104771
Wings of the Dove, The (1997) 0.000000
Winnie the Pooh and the Blustery Day (1968) 0.142924
Winter Guest, The (1997) 0.353553
Wishmaster (1997) -0.066932
With Honors (1994) -0.111205
Withnail and I (1987) 0.083045
Wizard of Oz, The (1939) 0.266335
Wolf (1994) 0.113214
Women, The (1939) -0.175608
Wonderful, Horrible Life of Leni Riefenstahl, The (1993) 0.707107
World of Apu, The (Apur Sansar) (1959) -0.522233
Wrong Trousers, The (1993) 0.216204
Wyatt Earp (1994) 0.059560
Year of the Horse (1997) -1.000000
Young Frankenstein (1974) 0.192589
Young Guns (1988) 0.186377
Young Guns II (1990) 0.228615
Young Poisoner's Handbook, The (1995) -0.007374
Zeus and Roxanne (1997) 0.818182
unknown 0.723123
Length: 1410, dtype: float64
similarMovies.sort_values(ascending=False)
title
No Escape (1994) 1.000000
Man of the Year (1995) 1.000000
Hollow Reed (1996) 1.000000
Commandments (1997) 1.000000
Cosi (1996) 1.000000
Stripes (1981) 1.000000
Golden Earrings (1947) 1.000000
Mondo (1996) 1.000000
Line King: Al Hirschfeld, The (1996) 1.000000
Outlaw, The (1943) 1.000000
Hurricane Streets (1998) 1.000000
Scarlet Letter, The (1926) 1.000000
Safe Passage (1994) 1.000000
Good Man in Africa, A (1994) 1.000000
Full Speed (1996) 1.000000
Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991) 1.000000
Star Wars (1977) 1.000000
Ed's Next Move (1996) 1.000000
Twisted (1996) 1.000000
Beans of Egypt, Maine, The (1994) 1.000000
Last Time I Saw Paris, The (1954) 1.000000
Maya Lin: A Strong Clear Vision (1994) 1.000000
Designated Mourner, The (1997) 0.970725
Albino Alligator (1996) 0.968496
Angel Baby (1995) 0.962250
Prisoner of the Mountains (Kavkazsky Plennik) (1996) 0.927173
Love in the Afternoon (1957) 0.923381
'Til There Was You (1997) 0.872872
A Chef in Love (1996) 0.868599
Guantanamera (1994) 0.866025
...
Pushing Hands (1992) -1.000000
Lamerica (1994) -1.000000
Year of the Horse (1997) -1.000000
Collectionneuse, La (1967) -1.000000
Dream Man (1995) -1.000000
S.F.W. (1994) -1.000000
Nightwatch (1997) -1.000000
Squeeze (1996) -1.000000
Glass Shield, The (1994) -1.000000
Slingshot, The (1993) -1.000000
Lover's Knot (1996) -1.000000
Tough and Deadly (1995) -1.000000
Sliding Doors (1998) -1.000000
Show, The (1995) -1.000000
Nil By Mouth (1997) -1.000000
Fall (1997) -1.000000
Sudden Manhattan (1996) -1.000000
Salut cousin! (1996) -1.000000
Neon Bible, The (1995) -1.000000
Crossfire (1947) -1.000000
Love and Death on Long Island (1997) -1.000000
For Ever Mozart (1996) -1.000000
Swept from the Sea (1997) -1.000000
Fille seule, La (A Single Girl) (1995) -1.000000
American Dream (1990) -1.000000
Theodore Rex (1995) -1.000000
I Like It Like That (1994) -1.000000
Two Deaths (1995) -1.000000
Roseanna's Grave (For Roseanna) (1997) -1.000000
Frankie Starlight (1995) -1.000000
Length: 1410, dtype: float64movieStats = df.groupby('title').agg({'rating':[np.size,np.mean]})
movieStats.head(10)
| rating | ||
|---|---|---|
| size | mean | |
| title | ||
| 'Til There Was You (1997) | 9 | 2.333333 |
| 1-900 (1994) | 5 | 2.600000 |
| 101 Dalmatians (1996) | 109 | 2.908257 |
| 12 Angry Men (1957) | 125 | 4.344000 |
| 187 (1997) | 41 | 3.024390 |
| 2 Days in the Valley (1996) | 93 | 3.225806 |
| 20,000 Leagues Under the Sea (1954) | 72 | 3.500000 |
| 2001: A Space Odyssey (1968) | 259 | 3.969112 |
| 3 Ninjas: High Noon At Mega Mountain (1998) | 5 | 1.000000 |
| 39 Steps, The (1935) | 59 | 4.050847 |
# Keep only the titles with at least 100 ratings (the cutoff of 100 is an
# arbitrary choice), then list them from highest to lowest mean rating.
popularMovies = movieStats.loc[movieStats[('rating', 'size')] >= 100]
popularMovies.sort_values(by=('rating', 'mean'), ascending=False)
| rating | ||
|---|---|---|
| size | mean | |
| title | ||
| Close Shave, A (1995) | 112 | 4.491071 |
| Schindler's List (1993) | 298 | 4.466443 |
| Wrong Trousers, The (1993) | 118 | 4.466102 |
| Casablanca (1942) | 243 | 4.456790 |
| Shawshank Redemption, The (1994) | 283 | 4.445230 |
| Rear Window (1954) | 209 | 4.387560 |
| Usual Suspects, The (1995) | 267 | 4.385768 |
| Star Wars (1977) | 584 | 4.359589 |
| 12 Angry Men (1957) | 125 | 4.344000 |
| Citizen Kane (1941) | 198 | 4.292929 |
| To Kill a Mockingbird (1962) | 219 | 4.292237 |
| One Flew Over the Cuckoo's Nest (1975) | 264 | 4.291667 |
| Silence of the Lambs, The (1991) | 390 | 4.289744 |
| North by Northwest (1959) | 179 | 4.284916 |
| Godfather, The (1972) | 413 | 4.283293 |
| Secrets & Lies (1996) | 162 | 4.265432 |
| Good Will Hunting (1997) | 198 | 4.262626 |
| Manchurian Candidate, The (1962) | 131 | 4.259542 |
| Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) | 194 | 4.252577 |
| Raiders of the Lost Ark (1981) | 420 | 4.252381 |
| Vertigo (1958) | 179 | 4.251397 |
| Titanic (1997) | 350 | 4.245714 |
| Lawrence of Arabia (1962) | 173 | 4.231214 |
| Maltese Falcon, The (1941) | 138 | 4.210145 |
| Empire Strikes Back, The (1980) | 368 | 4.206522 |
| Boot, Das (1981) | 201 | 4.203980 |
| Sling Blade (1996) | 136 | 4.198529 |
| As Good As It Gets (1997) | 112 | 4.196429 |
| Godfather: Part II, The (1974) | 209 | 4.186603 |
| African Queen, The (1951) | 152 | 4.184211 |
| ... | ... | ... |
| Ace Ventura: Pet Detective (1994) | 103 | 3.048544 |
| Conan the Barbarian (1981) | 107 | 3.046729 |
| Grumpier Old Men (1995) | 148 | 3.040541 |
| Star Trek: The Motion Picture (1979) | 117 | 3.034188 |
| Broken Arrow (1996) | 254 | 3.031496 |
| First Wives Club, The (1996) | 160 | 3.018750 |
| Net, The (1995) | 120 | 3.008333 |
| Evita (1996) | 259 | 2.980695 |
| Natural Born Killers (1994) | 128 | 2.953125 |
| Lost World: Jurassic Park, The (1997) | 158 | 2.943038 |
| Dante's Peak (1997) | 240 | 2.933333 |
| Nutty Professor, The (1996) | 163 | 2.914110 |
| 101 Dalmatians (1996) | 109 | 2.908257 |
| Father of the Bride Part II (1995) | 128 | 2.898438 |
| Mars Attacks! (1996) | 217 | 2.847926 |
| Multiplicity (1996) | 134 | 2.835821 |
| Alien 3 (1992) | 100 | 2.830000 |
| Volcano (1997) | 219 | 2.808219 |
| Waterworld (1995) | 102 | 2.803922 |
| Beavis and Butt-head Do America (1996) | 156 | 2.788462 |
| Mimic (1997) | 101 | 2.742574 |
| Down Periscope (1996) | 101 | 2.702970 |
| George of the Jungle (1997) | 162 | 2.685185 |
| Batman Returns (1992) | 142 | 2.683099 |
| Batman Forever (1995) | 114 | 2.666667 |
| Spawn (1997) | 143 | 2.615385 |
| Event Horizon (1997) | 127 | 2.574803 |
| Crash (1996) | 128 | 2.546875 |
| Jungle2Jungle (1997) | 132 | 2.439394 |
| Cable Guy, The (1996) | 106 | 2.339623 |
338 rows × 2 columns
# Attach each popular title's correlation with Star Wars as a 'similarity'
# column (left join on the title index).
# popularMovies has a two-level column header while the similarity frame has
# a single level; joining them directly triggers pandas' "merging between
# different levels" warning, and newer pandas releases are expected to reject
# it outright (verify against the pandas release notes). Flattening the
# header to plain tuple labels first avoids the mismatch and keeps the same
# ('rating', 'size') / ('rating', 'mean') / 'similarity' column labels.
df = popularMovies.copy()  # copy so popularMovies keeps its original header
df.columns = pd.Index(list(df.columns), tupleize_cols=False)
df = df.join(similarMovies.to_frame('similarity'))
df.head(10)
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\reshape\merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right) warnings.warn(msg, UserWarning)
| (rating, size) | (rating, mean) | similarity | |
|---|---|---|---|
| title | |||
| 101 Dalmatians (1996) | 109 | 2.908257 | 0.211132 |
| 12 Angry Men (1957) | 125 | 4.344000 | 0.184289 |
| 2001: A Space Odyssey (1968) | 259 | 3.969112 | 0.230884 |
| Absolute Power (1997) | 127 | 3.370079 | 0.085440 |
| Abyss, The (1989) | 151 | 3.589404 | 0.203709 |
| Ace Ventura: Pet Detective (1994) | 103 | 3.048544 | 0.062689 |
| Adventures of Priscilla, Queen of the Desert, The (1994) | 111 | 3.594595 | 0.054740 |
| African Queen, The (1951) | 152 | 4.184211 | 0.230540 |
| Air Force One (1997) | 431 | 3.631090 | 0.113164 |
| Aladdin (1992) | 219 | 3.812785 | 0.191621 |
# The 15 rows of df with the highest 'similarity' value, strongest first.
df.sort_values(by='similarity', ascending=False).head(15)
| (rating, size) | (rating, mean) | similarity | |
|---|---|---|---|
| title | |||
| Star Wars (1977) | 584 | 4.359589 | 1.000000 |
| Empire Strikes Back, The (1980) | 368 | 4.206522 | 0.748353 |
| Return of the Jedi (1983) | 507 | 4.007890 | 0.672556 |
| Raiders of the Lost Ark (1981) | 420 | 4.252381 | 0.536117 |
| Austin Powers: International Man of Mystery (1997) | 130 | 3.246154 | 0.377433 |
| Sting, The (1973) | 241 | 4.058091 | 0.367538 |
| Indiana Jones and the Last Crusade (1989) | 331 | 3.930514 | 0.350107 |
| Pinocchio (1940) | 101 | 3.673267 | 0.347868 |
| Frighteners, The (1996) | 115 | 3.234783 | 0.332729 |
| L.A. Confidential (1997) | 297 | 4.161616 | 0.319065 |
| Wag the Dog (1997) | 137 | 3.510949 | 0.318645 |
| Dumbo (1941) | 123 | 3.495935 | 0.317656 |
| Bridge on the River Kwai, The (1957) | 165 | 4.175758 | 0.316580 |
| Philadelphia Story, The (1940) | 104 | 4.115385 | 0.314272 |
| Miracle on 34th Street (1994) | 101 | 3.722772 | 0.310921 |