#import all necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.cross_validation import train_test_split
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20. "This module will be removed in 0.20.", DeprecationWarning)
# First look at the raw ratings file: it has no header row (header=None), so
# pandas labels the columns 0, 1, 2, ...
ratings = pd.read_csv('u.data',header=None,sep='\t') # '\t' is the escape sequence for a single tab character: fields are tab-separated
ratings.head()
0 | 1 | 2 | 3 | |
---|---|---|---|---|
0 | 0 | 50 | 5 | 881250949 |
1 | 0 | 172 | 5 | 881250949 |
2 | 0 | 133 | 1 | 881250949 |
3 | 196 | 242 | 3 | 881250949 |
4 | 186 | 302 | 3 | 891717742 |
# Reload the ratings, keeping only the first three columns of the
# tab-separated file and giving them meaningful names.
r_cols = ['user_id', 'movie_id', 'rating']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols, usecols=range(3))
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(ratings.head())

# Load the first two columns (id and title) of the pipe-separated item file.
# The titles contain accented characters that are not valid UTF-8 (they show
# up as garbage when read without an encoding), so the file is decoded
# explicitly as Latin-1. Without this, Python 3 raises UnicodeDecodeError.
m_cols = ['movie_id', 'title']
movies = pd.read_csv('u.item', sep='|', names=m_cols, usecols=range(2),
                     encoding='latin-1')
print(movies.head())
user_id movie_id rating 0 0 50 5 1 0 172 5 2 0 133 1 3 196 242 3 4 186 302 3 movie_id title 0 1 Toy Story (1995) 1 2 GoldenEye (1995) 2 3 Four Rooms (1995) 3 4 Get Shorty (1995) 4 5 Copycat (1995)
# Attach a title to every rating by joining the two frames on their shared
# 'movie_id' column.
df = ratings.merge(movies, on='movie_id')
df.head()
# To inspect the distinct movie ids instead: df['movie_id'].unique()
user_id | movie_id | rating | title | |
---|---|---|---|---|
0 | 0 | 50 | 5 | Star Wars (1977) |
1 | 290 | 50 | 5 | Star Wars (1977) |
2 | 79 | 50 | 4 | Star Wars (1977) |
3 | 2 | 50 | 5 | Star Wars (1977) |
4 | 8 | 50 | 5 | Star Wars (1977) |
# Reshape the long ratings table into a user x movie matrix: one row per
# user_id, one column per title, each cell holding that user's rating.
# Cells for movies a user has not rated come out as NaN.
movieRatings = pd.pivot_table(df, values='rating', index=['user_id'], columns=['title'])
movieRatings.head()
title | 'Til There Was You (1997) | 1-900 (1994) | 101 Dalmatians (1996) | 12 Angry Men (1957) | 187 (1997) | 2 Days in the Valley (1996) | 20,000 Leagues Under the Sea (1954) | 2001: A Space Odyssey (1968) | 3 Ninjas: High Noon At Mega Mountain (1998) | 39 Steps, The (1935) | ... | Yankee Zulu (1994) | Year of the Horse (1997) | You So Crazy (1994) | Young Frankenstein (1974) | Young Guns (1988) | Young Guns II (1990) | Young Poisoner's Handbook, The (1995) | Zeus and Roxanne (1997) | unknown | � k�ldum klaka (Cold Fever) (1994) |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
user_id | |||||||||||||||||||||
0 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | NaN | NaN | 2.0 | 5.0 | NaN | NaN | 3.0 | 4.0 | NaN | NaN | ... | NaN | NaN | NaN | 5.0 | 3.0 | NaN | NaN | NaN | 4.0 | NaN |
2 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | NaN | NaN | NaN | NaN | 2.0 | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 rows × 1664 columns
# Pull out the single column of per-user ratings for the reference movie.
starWarsRatings = movieRatings.loc[:, 'Star Wars (1977)']
starWarsRatings.head(20)
user_id 0 5.0 1 5.0 2 5.0 3 NaN 4 5.0 5 4.0 6 4.0 7 5.0 8 5.0 9 5.0 10 5.0 11 NaN 12 4.0 13 5.0 14 5.0 15 5.0 16 NaN 17 NaN 18 4.0 19 NaN Name: Star Wars (1977), dtype: float64
# Distribution of the Star Wars ratings. dropna=False makes the NaN entries
# (users with no rating for this movie) appear as their own bucket.
starWarsRatings.value_counts(dropna=False)
NaN 360 5.0 326 4.0 176 3.0 57 2.0 16 1.0 9 Name: Star Wars (1977), dtype: int64
# Sanity check on a single pair: the correlation matrix of the rating columns
# for just these two titles.
movieRatings[['101 Dalmatians (1996)','Star Wars (1977)']].corr()
title | 101 Dalmatians (1996) | Star Wars (1977) |
---|---|---|
title | ||
101 Dalmatians (1996) | 1.000000 | 0.211132 |
Star Wars (1977) | 0.211132 | 1.000000 |
# corrwith correlates every column of movieRatings (i.e. every movie) with the
# Star Wars rating Series and returns one coefficient per title. Titles for
# which no coefficient can be computed come back as NaN.
movieRatings.corrwith(starWarsRatings)
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\numpy\lib\function_base.py:2995: RuntimeWarning: Degrees of freedom <= 0 for slice c = cov(x, y, rowvar) C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\numpy\lib\function_base.py:2929: RuntimeWarning: divide by zero encountered in double_scalars c *= 1. / np.float64(fact)
title 'Til There Was You (1997) 0.872872 1-900 (1994) -0.645497 101 Dalmatians (1996) 0.211132 12 Angry Men (1957) 0.184289 187 (1997) 0.027398 2 Days in the Valley (1996) 0.066654 20,000 Leagues Under the Sea (1954) 0.289768 2001: A Space Odyssey (1968) 0.230884 3 Ninjas: High Noon At Mega Mountain (1998) NaN 39 Steps, The (1935) 0.106453 8 1/2 (1963) -0.142977 8 Heads in a Duffel Bag (1997) -0.577350 8 Seconds (1994) -0.755929 A Chef in Love (1996) 0.868599 Above the Rim (1994) -0.645497 Absolute Power (1997) 0.085440 Abyss, The (1989) 0.203709 Ace Ventura: Pet Detective (1994) 0.062689 Ace Ventura: When Nature Calls (1995) 0.094514 Across the Sea of Time (1995) -0.132453 Addams Family Values (1993) 0.132264 Addicted to Love (1997) 0.028173 Addiction, The (1995) 0.507093 Adventures of Pinocchio, The (1996) 0.111616 Adventures of Priscilla, Queen of the Desert, The (1994) 0.054740 Adventures of Robin Hood, The (1938) 0.144587 Affair to Remember, An (1957) 0.225451 African Queen, The (1951) 0.230540 Afterglow (1997) 0.420084 Age of Innocence, The (1993) -0.037176 ... 
Window to Paris (1994) NaN Wings of Courage (1995) NaN Wings of Desire (1987) -0.104771 Wings of the Dove, The (1997) 0.000000 Winnie the Pooh and the Blustery Day (1968) 0.142924 Winter Guest, The (1997) 0.353553 Wishmaster (1997) -0.066932 With Honors (1994) -0.111205 Withnail and I (1987) 0.083045 Witness (1985) NaN Wizard of Oz, The (1939) 0.266335 Wolf (1994) 0.113214 Woman in Question, The (1950) NaN Women, The (1939) -0.175608 Wonderful, Horrible Life of Leni Riefenstahl, The (1993) 0.707107 Wonderland (1997) NaN Wooden Man's Bride, The (Wu Kui) (1994) NaN World of Apu, The (Apur Sansar) (1959) -0.522233 Wrong Trousers, The (1993) 0.216204 Wyatt Earp (1994) 0.059560 Yankee Zulu (1994) NaN Year of the Horse (1997) -1.000000 You So Crazy (1994) NaN Young Frankenstein (1974) 0.192589 Young Guns (1988) 0.186377 Young Guns II (1990) 0.228615 Young Poisoner's Handbook, The (1995) -0.007374 Zeus and Roxanne (1997) 0.818182 unknown 0.723123 � k�ldum klaka (Cold Fever) (1994) NaN Length: 1664, dtype: float64
# Correlation of each movie's ratings with the Star Wars ratings, keeping only
# the titles for which a coefficient could actually be computed (NaN removed).
similarMovies = movieRatings.corrwith(starWarsRatings).dropna()
similarMovies
title 'Til There Was You (1997) 0.872872 1-900 (1994) -0.645497 101 Dalmatians (1996) 0.211132 12 Angry Men (1957) 0.184289 187 (1997) 0.027398 2 Days in the Valley (1996) 0.066654 20,000 Leagues Under the Sea (1954) 0.289768 2001: A Space Odyssey (1968) 0.230884 39 Steps, The (1935) 0.106453 8 1/2 (1963) -0.142977 8 Heads in a Duffel Bag (1997) -0.577350 8 Seconds (1994) -0.755929 A Chef in Love (1996) 0.868599 Above the Rim (1994) -0.645497 Absolute Power (1997) 0.085440 Abyss, The (1989) 0.203709 Ace Ventura: Pet Detective (1994) 0.062689 Ace Ventura: When Nature Calls (1995) 0.094514 Across the Sea of Time (1995) -0.132453 Addams Family Values (1993) 0.132264 Addicted to Love (1997) 0.028173 Addiction, The (1995) 0.507093 Adventures of Pinocchio, The (1996) 0.111616 Adventures of Priscilla, Queen of the Desert, The (1994) 0.054740 Adventures of Robin Hood, The (1938) 0.144587 Affair to Remember, An (1957) 0.225451 African Queen, The (1951) 0.230540 Afterglow (1997) 0.420084 Age of Innocence, The (1993) -0.037176 Air Bud (1997) 0.012557 ... 
Whole Wide World, The (1996) 0.000000 Widows' Peak (1994) 0.136743 Wild America (1997) 0.040656 Wild Bill (1995) -0.286299 Wild Bunch, The (1969) -0.041299 Wild Reeds (1994) -0.377964 Wild Things (1998) -0.259437 William Shakespeare's Romeo and Juliet (1996) 0.098861 Willy Wonka and the Chocolate Factory (1971) 0.221902 Wings of Desire (1987) -0.104771 Wings of the Dove, The (1997) 0.000000 Winnie the Pooh and the Blustery Day (1968) 0.142924 Winter Guest, The (1997) 0.353553 Wishmaster (1997) -0.066932 With Honors (1994) -0.111205 Withnail and I (1987) 0.083045 Wizard of Oz, The (1939) 0.266335 Wolf (1994) 0.113214 Women, The (1939) -0.175608 Wonderful, Horrible Life of Leni Riefenstahl, The (1993) 0.707107 World of Apu, The (Apur Sansar) (1959) -0.522233 Wrong Trousers, The (1993) 0.216204 Wyatt Earp (1994) 0.059560 Year of the Horse (1997) -1.000000 Young Frankenstein (1974) 0.192589 Young Guns (1988) 0.186377 Young Guns II (1990) 0.228615 Young Poisoner's Handbook, The (1995) -0.007374 Zeus and Roxanne (1997) 0.818182 unknown 0.723123 Length: 1410, dtype: float64
# Rank the titles from most to least correlated with Star Wars. The sorted
# result is only displayed; similarMovies itself is left unchanged.
# NOTE(review): the many exact +/-1.0 values at both ends presumably come from
# movies with very few ratings in common with Star Wars -- verify against the
# per-title rating counts before trusting this ranking.
similarMovies.sort_values(ascending=False)
title No Escape (1994) 1.000000 Man of the Year (1995) 1.000000 Hollow Reed (1996) 1.000000 Commandments (1997) 1.000000 Cosi (1996) 1.000000 Stripes (1981) 1.000000 Golden Earrings (1947) 1.000000 Mondo (1996) 1.000000 Line King: Al Hirschfeld, The (1996) 1.000000 Outlaw, The (1943) 1.000000 Hurricane Streets (1998) 1.000000 Scarlet Letter, The (1926) 1.000000 Safe Passage (1994) 1.000000 Good Man in Africa, A (1994) 1.000000 Full Speed (1996) 1.000000 Old Lady Who Walked in the Sea, The (Vieille qui marchait dans la mer, La) (1991) 1.000000 Star Wars (1977) 1.000000 Ed's Next Move (1996) 1.000000 Twisted (1996) 1.000000 Beans of Egypt, Maine, The (1994) 1.000000 Last Time I Saw Paris, The (1954) 1.000000 Maya Lin: A Strong Clear Vision (1994) 1.000000 Designated Mourner, The (1997) 0.970725 Albino Alligator (1996) 0.968496 Angel Baby (1995) 0.962250 Prisoner of the Mountains (Kavkazsky Plennik) (1996) 0.927173 Love in the Afternoon (1957) 0.923381 'Til There Was You (1997) 0.872872 A Chef in Love (1996) 0.868599 Guantanamera (1994) 0.866025 ... Pushing Hands (1992) -1.000000 Lamerica (1994) -1.000000 Year of the Horse (1997) -1.000000 Collectionneuse, La (1967) -1.000000 Dream Man (1995) -1.000000 S.F.W. (1994) -1.000000 Nightwatch (1997) -1.000000 Squeeze (1996) -1.000000 Glass Shield, The (1994) -1.000000 Slingshot, The (1993) -1.000000 Lover's Knot (1996) -1.000000 Tough and Deadly (1995) -1.000000 Sliding Doors (1998) -1.000000 Show, The (1995) -1.000000 Nil By Mouth (1997) -1.000000 Fall (1997) -1.000000 Sudden Manhattan (1996) -1.000000 Salut cousin! 
(1996) -1.000000 Neon Bible, The (1995) -1.000000 Crossfire (1947) -1.000000 Love and Death on Long Island (1997) -1.000000 For Ever Mozart (1996) -1.000000 Swept from the Sea (1997) -1.000000 Fille seule, La (A Single Girl) (1995) -1.000000 American Dream (1990) -1.000000 Theodore Rex (1995) -1.000000 I Like It Like That (1994) -1.000000 Two Deaths (1995) -1.000000 Roseanna's Grave (For Roseanna) (1997) -1.000000 Frankie Starlight (1995) -1.000000 Length: 1410, dtype: float64
# Per-title rating statistics: how many ratings each movie received ('size')
# and its average rating ('mean'). The result has two-level columns,
# ('rating', 'size') and ('rating', 'mean').
# The aggregations are named by string rather than passed as np.size / np.mean:
# the string form yields the same column labels and avoids the deprecation
# warning current pandas emits for NumPy callables in groupby aggregations.
movieStats = df.groupby('title').agg({'rating': ['size', 'mean']})
movieStats.head(10)
rating | ||
---|---|---|
size | mean | |
title | ||
'Til There Was You (1997) | 9 | 2.333333 |
1-900 (1994) | 5 | 2.600000 |
101 Dalmatians (1996) | 109 | 2.908257 |
12 Angry Men (1957) | 125 | 4.344000 |
187 (1997) | 41 | 3.024390 |
2 Days in the Valley (1996) | 93 | 3.225806 |
20,000 Leagues Under the Sea (1954) | 72 | 3.500000 |
2001: A Space Odyssey (1968) | 259 | 3.969112 |
3 Ninjas: High Noon At Mega Mountain (1998) | 5 | 1.000000 |
39 Steps, The (1935) | 59 | 4.050847 |
# Keep only the titles that received at least 100 ratings (the cutoff of 100
# is an arbitrary choice), then display them from best to worst average rating.
popularMovies = movieStats.loc[movieStats[('rating', 'size')] >= 100]
popularMovies.sort_values(by=('rating', 'mean'), ascending=False)
rating | ||
---|---|---|
size | mean | |
title | ||
Close Shave, A (1995) | 112 | 4.491071 |
Schindler's List (1993) | 298 | 4.466443 |
Wrong Trousers, The (1993) | 118 | 4.466102 |
Casablanca (1942) | 243 | 4.456790 |
Shawshank Redemption, The (1994) | 283 | 4.445230 |
Rear Window (1954) | 209 | 4.387560 |
Usual Suspects, The (1995) | 267 | 4.385768 |
Star Wars (1977) | 584 | 4.359589 |
12 Angry Men (1957) | 125 | 4.344000 |
Citizen Kane (1941) | 198 | 4.292929 |
To Kill a Mockingbird (1962) | 219 | 4.292237 |
One Flew Over the Cuckoo's Nest (1975) | 264 | 4.291667 |
Silence of the Lambs, The (1991) | 390 | 4.289744 |
North by Northwest (1959) | 179 | 4.284916 |
Godfather, The (1972) | 413 | 4.283293 |
Secrets & Lies (1996) | 162 | 4.265432 |
Good Will Hunting (1997) | 198 | 4.262626 |
Manchurian Candidate, The (1962) | 131 | 4.259542 |
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963) | 194 | 4.252577 |
Raiders of the Lost Ark (1981) | 420 | 4.252381 |
Vertigo (1958) | 179 | 4.251397 |
Titanic (1997) | 350 | 4.245714 |
Lawrence of Arabia (1962) | 173 | 4.231214 |
Maltese Falcon, The (1941) | 138 | 4.210145 |
Empire Strikes Back, The (1980) | 368 | 4.206522 |
Boot, Das (1981) | 201 | 4.203980 |
Sling Blade (1996) | 136 | 4.198529 |
As Good As It Gets (1997) | 112 | 4.196429 |
Godfather: Part II, The (1974) | 209 | 4.186603 |
African Queen, The (1951) | 152 | 4.184211 |
... | ... | ... |
Ace Ventura: Pet Detective (1994) | 103 | 3.048544 |
Conan the Barbarian (1981) | 107 | 3.046729 |
Grumpier Old Men (1995) | 148 | 3.040541 |
Star Trek: The Motion Picture (1979) | 117 | 3.034188 |
Broken Arrow (1996) | 254 | 3.031496 |
First Wives Club, The (1996) | 160 | 3.018750 |
Net, The (1995) | 120 | 3.008333 |
Evita (1996) | 259 | 2.980695 |
Natural Born Killers (1994) | 128 | 2.953125 |
Lost World: Jurassic Park, The (1997) | 158 | 2.943038 |
Dante's Peak (1997) | 240 | 2.933333 |
Nutty Professor, The (1996) | 163 | 2.914110 |
101 Dalmatians (1996) | 109 | 2.908257 |
Father of the Bride Part II (1995) | 128 | 2.898438 |
Mars Attacks! (1996) | 217 | 2.847926 |
Multiplicity (1996) | 134 | 2.835821 |
Alien 3 (1992) | 100 | 2.830000 |
Volcano (1997) | 219 | 2.808219 |
Waterworld (1995) | 102 | 2.803922 |
Beavis and Butt-head Do America (1996) | 156 | 2.788462 |
Mimic (1997) | 101 | 2.742574 |
Down Periscope (1996) | 101 | 2.702970 |
George of the Jungle (1997) | 162 | 2.685185 |
Batman Returns (1992) | 142 | 2.683099 |
Batman Forever (1995) | 114 | 2.666667 |
Spawn (1997) | 143 | 2.615385 |
Event Horizon (1997) | 127 | 2.574803 |
Crash (1996) | 128 | 2.546875 |
Jungle2Jungle (1997) | 132 | 2.439394 |
Cable Guy, The (1996) | 106 | 2.339623 |
338 rows × 2 columns
# Combine the popularity statistics with the Star Wars correlations, matching
# rows on the movie-title index (left join: every popular movie is kept).
#
# popularMovies has two-level columns (('rating', 'size'), ('rating', 'mean'))
# while the similarity data has a single level. Joining them directly triggers
# pandas' "merging between different levels" warning, and newer pandas releases
# refuse the operation outright. The statistics columns are therefore flattened
# first into a plain Index that keeps the same tuple labels, so the resulting
# frame has the same columns as before: (rating, size), (rating, mean),
# similarity.
#
# NOTE: this rebinds `df`, which previously held the merged ratings/movies
# frame; cells that need that frame must be re-run from the merge step.
popularFlat = popularMovies.copy()
popularFlat.columns = pd.Index(list(popularFlat.columns), tupleize_cols=False)
df = popularFlat.join(similarMovies.to_frame('similarity'))
df.head(10)
C:\Users\nwerner\AppData\Local\Continuum\Anaconda2\lib\site-packages\pandas\core\reshape\merge.py:551: UserWarning: merging between different levels can give an unintended result (2 levels on the left, 1 on the right) warnings.warn(msg, UserWarning)
(rating, size) | (rating, mean) | similarity | |
---|---|---|---|
title | |||
101 Dalmatians (1996) | 109 | 2.908257 | 0.211132 |
12 Angry Men (1957) | 125 | 4.344000 | 0.184289 |
2001: A Space Odyssey (1968) | 259 | 3.969112 | 0.230884 |
Absolute Power (1997) | 127 | 3.370079 | 0.085440 |
Abyss, The (1989) | 151 | 3.589404 | 0.203709 |
Ace Ventura: Pet Detective (1994) | 103 | 3.048544 | 0.062689 |
Adventures of Priscilla, Queen of the Desert, The (1994) | 111 | 3.594595 | 0.054740 |
African Queen, The (1951) | 152 | 4.184211 | 0.230540 |
Air Force One (1997) | 431 | 3.631090 | 0.113164 |
Aladdin (1992) | 219 | 3.812785 | 0.191621 |
# Show the 15 popular movies whose ratings correlate most strongly with
# Star Wars (highest similarity first).
df.sort_values(by='similarity', ascending=False).head(15)
(rating, size) | (rating, mean) | similarity | |
---|---|---|---|
title | |||
Star Wars (1977) | 584 | 4.359589 | 1.000000 |
Empire Strikes Back, The (1980) | 368 | 4.206522 | 0.748353 |
Return of the Jedi (1983) | 507 | 4.007890 | 0.672556 |
Raiders of the Lost Ark (1981) | 420 | 4.252381 | 0.536117 |
Austin Powers: International Man of Mystery (1997) | 130 | 3.246154 | 0.377433 |
Sting, The (1973) | 241 | 4.058091 | 0.367538 |
Indiana Jones and the Last Crusade (1989) | 331 | 3.930514 | 0.350107 |
Pinocchio (1940) | 101 | 3.673267 | 0.347868 |
Frighteners, The (1996) | 115 | 3.234783 | 0.332729 |
L.A. Confidential (1997) | 297 | 4.161616 | 0.319065 |
Wag the Dog (1997) | 137 | 3.510949 | 0.318645 |
Dumbo (1941) | 123 | 3.495935 | 0.317656 |
Bridge on the River Kwai, The (1957) | 165 | 4.175758 | 0.316580 |
Philadelphia Story, The (1940) | 104 | 4.115385 | 0.314272 |
Miracle on 34th Street (1994) | 101 | 3.722772 | 0.310921 |