Topics
If any NLTK resources are missing (nltk.xxxx errors), call nltk.download().
You can either download the missing resources individually or download all packages at once.
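A minimal sketch of both options (download whichever resource the error message names; 'punkt' is just an example):
import nltk
nltk.download('punkt')   # download a single resource, e.g. the Punkt sentence tokenizer models
# nltk.download()        # or open the interactive downloader and grab everything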
from nltk.tokenize import word_tokenize
word_tokenize("Hello world")
message = 'I am Nolan Werner, from west africa'
message.split()
word_tokenize("This's a car")
import nltk
from nltk.tokenize import sent_tokenize
# sent_tokenize splits the text into a list of sentences
text="Welcome readers. I hope you find it interesting. Please do reply"
print(sent_tokenize(text))
# As usual, we use loops to do a lot of things. But list comprehensions make
# things much easier for us, and usually read more cleanly.
# Suppose that we have a list of numbers, i.e.
integerNumbers = [0,1,2,3,4,5,6,7,8,9]
# Create a list that contains the square of each element
size = len(integerNumbers)
reservoir = [0]*size # need this to be populated
for i in range(len(integerNumbers)):
    reservoir[i] = integerNumbers[i]**2
reservoir
# The same result could also have been obtained by just appending
reservoir = []
for i in range(size):
    reservoir.append(integerNumbers[i]**2)
reservoir
# The same job with list comprehension
[i**2 for i in range(10)]
# list comprehension selecting the even numbers from 0 to 9
[i for i in range(10) if i%2==0]
# two for-clauses in one comprehension: all (x, y) pairs
[(x,y)
 for x in range(2)
 for y in ['a','b','c']]
# conditional if can be used within a list comprehension
even_numbers = [val for val in range(10) if val%2==0]
even_numbers
[i**2 if i%2==0 else i**3 for i in range(10)] # Please take a minute and
# see what is going on here
vowels = ['a', 'e', 'i', 'o', 'u']
[ch.upper() if ch in vowels else ch.lower() for ch in 'africa']
# dictionary comprehension
{i:i**2 for i in range(10)}
# Take a list of words and map each word to its length.
wordList = ['Moussa','John','Aristide','Abraham','Obama']
{word:len(word) for word in wordList}
# map takes a function and an iterable and applies the function to each
# element of the iterable. In Python 3 it returns an iterator, so wrap it
# in list() to see the results.
def square(x): return x**2
list(map(square,range(10)))
list(map(lambda x: x**2,range(10)))
list(map(lambda word:len(word),'Arshad is very smart'.split()))
# split breaks the string into a list of words
# filter, as its name suggests, selects the elements that satisfy a given
# condition
even_numbers = list(filter(lambda x:x%2==0, range(10)))
even_numbers
word_with_len_2 = list(filter(lambda w:len(w)==2,['I','am','from','India']))
word_with_len_2
# reduce can be looked at as follows:
# suppose we have a list of integers [val1,val2,val3,val4,val5]
# and we aim to sum all of its elements. Here, our function is
# add (+), so the total sum is computed as:
# ((((val1+val2)+val3)+val4)+val5)
from functools import reduce # needed in Python 3 (reduce is a builtin in Python 2)
reduce(lambda x,y: x+y,[1,2,3,4,5])
# Guess what? One can initialize the sum
reduce(lambda x,y: x+y,[1,2,3,4,5],30) # here the running total starts
# at 30 instead of 0
reduce(lambda x,y:x*y,[1,2,3,4,5]) # I am multiplying all the elements of
# the list
reduce(lambda x,y:x*y,[1,2,3,4,5],30) # here the product is initialized to 30
# let's put map, filter and reduce to work together
# Suppose we have the integers from 0 to 9. Let's compute
# the sum of the squares of all the odd elements among them
reduce(lambda x,y: x+y,map(lambda x:x**2,filter(lambda x: x%2==1,range(10))))
import nltk
# word_tokenize is used to find the list of words in strings
text = nltk.word_tokenize("Pierre Vinken, 59 years old, will join as a nonexecutive director on Nov. 29.")
print(text)
# Treebank tokenizer uses regular expressions to tokenize texts
import nltk
from nltk.tokenize import TreebankWordTokenizer
tokenizer = TreebankWordTokenizer()
print (tokenizer.tokenize("Have a nice day. I hope you find the book interesting"))
print (tokenizer.tokenize("Don't hesitate to ask questions"))
from nltk.tokenize import WordPunctTokenizer
tokenizer = WordPunctTokenizer()
print (tokenizer.tokenize("Don't hesitate to ask questions"))
import nltk
from nltk.tokenize import RegexpTokenizer
sent = "She secures 90.56% in class X. She is a meritorious student"
capt = RegexpTokenizer(r'[A-Z]\w+') # grab tokens that start with a capital letter
capt.tokenize(sent)
import nltk
from nltk.tokenize import BlanklineTokenizer
sent = '''She secures

90.56% in class X.

She is a meritorious student'''
BlanklineTokenizer().tokenize(sent)
from nltk.stem import PorterStemmer
stemmerporter = PorterStemmer()
print(stemmerporter.stem('talking'))
print (stemmerporter.stem('happiness'))
print (stemmerporter.stem('happy'))
print (stemmerporter.stem('unhappy'))
print (stemmerporter.stem('ran'))
print (stemmerporter.stem('is'))
words = ['houses', 'trains', 'pens', 'cars', 'eaten','sick', 'nice', 'bought', 'selling', 'sized',
'speech', 'rolling', 'marching', 'identification', 'universal', 'beautiful', 'references', 'countries','called']
single = [stemmerporter.stem(word) for word in words]
single
import nltk
from nltk.stem import LancasterStemmer
stemmerLan = LancasterStemmer()
print (stemmerLan.stem('happiness'))
print (stemmerLan.stem('happy'))
print (stemmerLan.stem('unhappy'))
print (stemmerLan.stem('ran'))
print (stemmerLan.stem('is'))
import nltk
from nltk.stem import RegexpStemmer
stemmerreg = RegexpStemmer('ing')
print (stemmerreg.stem('working'))
print (stemmerreg.stem('happiness'))
print (stemmerreg.stem('pairing'))
import nltk
from nltk.stem import SnowballStemmer
print (SnowballStemmer.languages)
spanishstemmer = SnowballStemmer('spanish')
print (spanishstemmer.stem('comiendo'))
frenchstemmer = SnowballStemmer('french')
print (frenchstemmer.stem('manger'))
print (frenchstemmer.stem('danser'))
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer_output = WordNetLemmatizer()
print (lemmatizer_output.lemmatize('working', pos='v'))
print (lemmatizer_output.lemmatize('ran', pos='v'))
print (lemmatizer_output.lemmatize('took', pos='v'))
print (lemmatizer_output.lemmatize('is', pos='v'))
print (lemmatizer_output.lemmatize('happiness'))
print (lemmatizer_output.lemmatize('took'))
- coordinating conjunctions get mapped to CC
- adverbs get mapped to RB
- prepositions get mapped to IN
- singular nouns get mapped to NN
- adjectives get mapped to JJ
- verbs in the third-person singular present get mapped to VBZ
import nltk
from nltk import word_tokenize
text = word_tokenize("It is a pleasant day today")
nltk.pos_tag(text) # pos_tag is NLTK's part-of-speech tagger
text = word_tokenize("They buy the permit in order to be able to attend the event")
nltk.pos_tag(text)
import nltk
from nltk.tag import DefaultTagger
tag = DefaultTagger('NN') # DefaultTagger assigns the same tag ('NN' here) to every token
tag.tag(['Beautiful', 'morning'])
!pip install --trusted-host pypi.python.org autocorrect
from autocorrect import spell
spell("Tghe")
!pip install --trusted-host pypi.python.org textblob
from textblob import TextBlob
b = TextBlob("I havv good speling!")
print(b.detect_language())
print (b.correct())
from textblob import Word
w = Word('falability')
w.spellcheck()
!pip install --trusted-host pypi.python.org langdetect
from langdetect import detect
print (detect("War doesn't show who's right, just who's left."))
print (detect("Ein, zwei, drei, vier"))
print (detect("Eu gosto de mulher"))
#en_blob = TextBlob(u'Simple is better than complex.')
#en_blob.translate(to='vi') # vi stands for vietnamese
en_blob = TextBlob(u'I am a free black man loved by Jesus Christ.')
en_blob.translate(to='pt')
There are two ways to compute TF-IDF features:
- by using CountVectorizer (introduced below) and then feeding its output into TfidfTransformer.
- by directly feeding the collection of text documents to TfidfVectorizer.
Vectorization turns a collection of text documents into numerical feature vectors.
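A small sketch of both routes on a toy corpus (variable names are just illustrative); with default settings they produce the same matrix:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
docs = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
# route 1: raw token counts first, then TF-IDF weighting on top
counts = CountVectorizer().fit_transform(docs)
tfidf_1 = TfidfTransformer().fit_transform(counts)
# route 2: go straight from raw text to TF-IDF features
tfidf_2 = TfidfVectorizer().fit_transform(docs)
print(np.allclose(tfidf_1.toarray(), tfidf_2.toarray())) # True: both routes agree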
# Questions?
# what is the difference between fit(), transform(), and fit_transform()?
# fit(): learns the model parameters (e.g. the vocabulary) from the training data
# transform(): applies the parameters learned by fit() to produce the
#   transformed data set
# fit_transform(): combines fit() and transform() on the same data set
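# A tiny sketch of the difference (toy strings, purely illustrative):
from sklearn.feature_extraction.text import CountVectorizer
train_docs = ['call me tonight', 'call me a cab']
new_docs = ['please call me']
cv = CountVectorizer()
cv.fit(train_docs)                     # fit(): learn the vocabulary from the training data
X_train = cv.transform(train_docs)     # transform(): encode documents with the learned vocabulary
X_new = cv.transform(new_docs)         # words unseen during fit ('please') are simply ignored
X_again = cv.fit_transform(train_docs) # fit_transform(): both steps at once on the same data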
import numpy as np
import scipy as sp
import pandas as pd
# we need to import and instantiate CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
simple_train = ['Call you tonight', 'Call me a cab', 'please call me... PLEASE!']
vect=CountVectorizer() # CountVectorizer converts a collection of text documents
# to a matrix of token counts. The output is a sparse matrix, i.e. a matrix that
# only stores its nonzero entries (most counts are zero, so this saves memory)
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
# learn the vocabulary from the documents and build the document-term matrix
tf
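# To see the sparse representation itself, print the matrix without .toarray()
# (a quick illustration; the exact display depends on the scipy version):
print(vect.fit_transform(simple_train)) # each line reads '(row, column)  count' for a nonzero entry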
# we can see that it is not displaying the word 'a'. This is because the default
# token_pattern only keeps tokens of two or more characters, so single-character words are dropped
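# A sketch of how to keep single-character tokens, by overriding the default token_pattern
# (vect_single is just an illustrative name):
vect_single = CountVectorizer(token_pattern=r'(?u)\b\w+\b') # one or more word characters per token
pd.DataFrame(vect_single.fit_transform(simple_train).toarray(), columns=vect_single.get_feature_names())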
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0) # axis=0 sums over the documents (rows),
# so with binary=True each entry is the number of documents containing that term
pd.DataFrame(df.reshape(1,6), columns=vect.get_feature_names())
# df above is the document frequency of each term
tf/df # dividing term frequency by document frequency is a crude form of tf-idf:
# terms that appear in many documents get down-weighted
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=1) # min_df is a threshold: while building
# the vocabulary, ignore terms whose document frequency is strictly lower
# than the given value (min_df=1 keeps everything)
print (vectorizer)
corpus = ['This is the first document','This is the second second document', 'And the third one', 'Is this the first document?']
X = vectorizer.fit_transform(corpus)
tf = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
tf
print (X)
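# A quick sketch of what min_df actually removes (vectorizer2 is just an illustrative name):
# with min_df=2, terms that occur in only one document ('and', 'one', 'second', 'third') drop out
vectorizer2 = CountVectorizer(min_df=2)
vectorizer2.fit_transform(corpus)
print (vectorizer2.get_feature_names()) # only terms appearing in at least two documents remain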
import pandas as pd
import numpy as np
import scipy as sp
from sklearn.model_selection import train_test_split # (sklearn.cross_validation in older scikit-learn versions)
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from textblob import TextBlob, Word
from nltk.stem.snowball import SnowballStemmer
%matplotlib inline
yelp = pd.read_csv('yelp.csv')
yelp.head()
yelp.info()
#create a new DataFrame that only contains the 5-star and 1-star reviews
yelp_best_worst = yelp[(yelp.stars==5) | (yelp.stars==1)]
yelp_best_worst.reset_index(drop=True, inplace=True) # reset the indices. And instead of
# creating another data frame, let's just do it inplace
x = yelp_best_worst.text #reviews
y = yelp_best_worst.stars #ratings
# print(x) to take a look at the reviews
# print(y) to take a look at the ratings
print (x.shape)
#split into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x,y,random_state=1)
print (x)
# use CountVetorizer to create document-term matrices from x_train and x_test
vect = CountVectorizer()
x_train_dtm = vect.fit_transform(x_train) # learn the vocabulary and create the document-term matrix
print (x_train_dtm)
#print (x_train_dtm.shape)
x_test_dtm= vect.transform(x_test)
#print x_test_dtm
#x_test_dtm.shape
print (x_test)
tf = pd.DataFrame(x_train_dtm.toarray(), columns=vect.get_feature_names())
tf.head()
x_train.head()
#don't lowercase
vect = CountVectorizer(lowercase=False)
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm.shape
# include 1-grams and 2-grams (an n-gram is a contiguous sequence of n adjacent words
# or letters taken from your source text)
vect = CountVectorizer(ngram_range=(1,2))
x_train_dtm = vect.fit_transform(x_train)
x_train_dtm.shape
print (vect.get_feature_names()[-50:]) # The last 50 words
vect = CountVectorizer()
x_train_dtm = vect.fit_transform(x_train)
x_test_dtm = vect.transform(x_test)
# Recall: fit() learns the model parameters (the vocabulary) from the training data,
# transform() applies them to produce the transformed data set, and
# fit_transform() combines both steps on the same data.
#Naive Bayes
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)
y_pred_class = nb.predict(x_test_dtm)
print (metrics.accuracy_score(y_test, y_pred_class))
y_test_binary = np.where(y_test==5, 1, 0) # recode the ratings as binary
max(y_test_binary.mean(), 1-y_test_binary.mean()) # null accuracy: always predicting the majority class
#define a function that accepts a vectorizer and calculates the accuracy
def tokenize_test(vect):
x_train_dtm = vect.fit_transform(x_train)
print ('Features: ', x_train_dtm.shape[1])
x_test_dtm = vect.transform(x_test)
nb = MultinomialNB()
nb.fit(x_train_dtm, y_train)
y_pred_class = nb.predict(x_test_dtm)
print ('Accuracy: ', metrics.accuracy_score(y_test, y_pred_class))
#include 1-grams and 2-grams
vect = CountVectorizer(ngram_range=(1,2))
tokenize_test(vect)
vect = CountVectorizer()
tokenize_test(vect)
#remove English stop words
vect = CountVectorizer(stop_words='english')
tokenize_test(vect)
# set of stop words
print (vect.get_stop_words())
#max_features
vect = CountVectorizer(stop_words='english', max_features=100)
tokenize_test(vect)
print(vect.get_feature_names())
vect = CountVectorizer(ngram_range=(1,2), max_features=100000)
tokenize_test(vect)
#min_df sets the minimum document frequency a term needs in order to enter the vocabulary
vect = CountVectorizer(ngram_range=(1,2), min_df=2)
tokenize_test(vect)
print (yelp_best_worst.text[0])
review = TextBlob(yelp_best_worst.text[0])
review.words
review.sentences
review.lower()
stemmer = SnowballStemmer('english')
print ([stemmer.stem(word) for word in review.words])
print ([word.lemmatize() for word in review.words])
#assume every word is a verb
print ([word.lemmatize(pos='v') for word in review.words])
def split_into_lemmas(text):
text = unicode(text, 'utf-8').lower() #Python 2
#text = str(text).lower() #Python 3
words = TextBlob(text).words
#return [word.lemmatize() for word in words]
return [stemmer.stem(word) for word in words]
#split review text into stems (or lemmas, if you swap in the commented-out return line) rather than into words (default)
vect = CountVectorizer(analyzer=split_into_lemmas)
tokenize_test(vect)
print (vect.get_feature_names()[-50:])
This is a repeat of the code from the TF-IDF intro section
#example documents
simple_train = ['call you tonight', 'Call me a cab', 'please call me... PLEASE!']
#term frequency
vect = CountVectorizer()
tf = pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
tf
#document frequency
vect = CountVectorizer(binary=True)
df = vect.fit_transform(simple_train).toarray().sum(axis=0)
pd.DataFrame(df.reshape(1,6), columns=vect.get_feature_names())
#term frequency- inverse document frequency (tf-idf) - Simple version
tf/df
vect = TfidfVectorizer()
pd.DataFrame(vect.fit_transform(simple_train).toarray(), columns=vect.get_feature_names())
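# For reference, a sketch of what TfidfVectorizer computes with its default settings
# (smooth_idf=True, norm='l2'): each raw count is weighted by
#   idf(t) = ln((1 + n_documents) / (1 + df(t))) + 1
# and each row of the resulting matrix is then scaled to unit length, which is why
# these numbers differ from the simple tf/df ratio above.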
#create a document-term matrix using TF-IDF
vect = TfidfVectorizer(stop_words='english')
dtm = vect.fit_transform(yelp.text)
features = vect.get_feature_names()
dtm.shape
def summarize():
#choose a random review that is at least 300 characters
review_length = 0
while review_length < 300:
review_id = np.random.randint(0, len(yelp))
review_text = unicode(yelp.text[review_id], 'utf-8') #Python 2
#review_text = str(yelp.text[review_id]) #Python3
review_length = len(review_text)
#create a dictionary of words and their TF-IDF scores
word_scores = {}
for word in TextBlob(review_text).words:
word = word.lower()
if word in features:
word_scores[word] = dtm[review_id, features.index(word)]
#print words with the top 5 TF-IDF scores
print ('TOP SCORING WORDS:')
top_scores = sorted(word_scores.items(), key=lambda x: x[1], reverse=True)[:5]
for word, score in top_scores:
print (word)
#print the review
print ('\n' + review_text)
summarize()
print (review)
max_i = 0
max_polarity = -float('inf')
min_i = 0
min_polarity = float('inf')
for i in range(len(yelp_best_worst.text)):
review_text = unicode(yelp_best_worst.text[i], 'utf-8') #Python 2
#review_text = str(yelp_best_worst.text[i]) #Python3
this_polarity = TextBlob(review_text).sentiment.polarity
if this_polarity > max_polarity:
max_i = i
max_polarity = this_polarity
if this_polarity < min_polarity:
min_i = i
min_polarity = this_polarity
print (TextBlob(yelp_best_worst.text[max_i]))
print (TextBlob(yelp_best_worst.text[min_i]))
#polarity ranges from -1 (most negative) to 1 (most positive)
print(review.sentiment.polarity)
print(max_polarity)
print(min_polarity)
#understanding the apply method
yelp['length'] = yelp.text.apply(len)
yelp.head(10)
#define a function that accepts text and returns polarity
def detect_sentiment(text):
return TextBlob(text.decode('utf-8')).sentiment.polarity #Python 2
#return TextBlob(text).sentiment.polarity Python 3
#create a new DataFrame column for sentiment
yelp['sentiment'] = yelp.text.apply(detect_sentiment)
yelp.boxplot(column='sentiment', by='stars')
#reviews with most positive sentiment
yelp[yelp.sentiment == 1].text.head()
#reviews with most negative sentiment
yelp[yelp.sentiment == -1].text.head()
# spelling correction
TextBlob('15 minuets late').correct()
# spellcheck
Word('parot').spellcheck()
# definitions
Word('bank').define('v')
# language identification
TextBlob('Hola amigos').detect_language()
# translation (the source language is detected automatically)
TextBlob('Hola amigos').translate(from_lang='auto', to='en')
#sentiment
TextBlob('That movie was good.').sentiment