# Import the numpy library. Gives you tools for manipulating data in arrays and more
import numpy as np
# Build a one-dimensional array holding three values.
np.array([1, 2, 3])
# Build a 3x2 array (three rows, two columns) from nested lists.
np.array([[1, 2], [3, 4], [4, 5]])
# A 2x3 array of zeros and a 3x2 array of ones.
np.zeros((2, 3))
np.ones((3, 2))
# A 4x4 array with ones on the diagonal and zeros elsewhere.
np.eye(4)
# Values starting at 0 in steps of 2; the stop value 10 itself is excluded.
np.arange(0, 10, 2)
# Five evenly spaced values from 0 to 10.
np.linspace(0, 10, num=5)
# Evenly spaced values from 0 to 1: first 4 points, then 5, then 27.
np.linspace(0, 1, num=4)
np.linspace(0, 1, num=5)
np.linspace(0, 1, num=27)
# Array Math
# Cannot add 10 to all values of a plain list: `list + int` raises TypeError.
# The error is caught and printed so the demonstration does not abort the
# rest of the script.
x = [1, 2, 3, 4, 5]
try:
    print(x + 10)
except TypeError as err:
    print("TypeError: %s" % err)
# Add 10 to all values by converting the list to a numpy array first.
x = [1, 2, 3, 4, 5]
x = np.array(x)
print(x + 10)
# Arrays are homogeneous: they convert all the items to the same data type,
# so this mixed input prints out with every item as a string.
y = np.array([1, 2, 3, 4, True, "A"])
print(y)
# np.delete returns a new array without the item at index 3.
z = np.delete(y, 3)
print(z)
# Values from 0 to 10 in steps of 1 (the stop value 11 is excluded).
a = np.arange(0, 11, 1)
a
# Using numpy array indexing: positions count from 0, and negative positions
# count backwards from the end.
x = np.arange(4)
print(x)
print(x[0])
print(x[-1])
# The number of rows and columns must multiply to the number of items,
# otherwise the reshape will not work.
x = np.arange(6).reshape(2, 3)
print(x)
# Two-dimensional indexing is array[row, column].
print(x[1, 2])
# Print the last row of the array.
print(x[-1])
# Print single items of the array, working backwards with negative indices.
print(x[-1, -1])
print(x[-2, -2])
print(x[-2, -3])
# Slicing: the first seven items, then the last three items.
a = np.arange(10)
print(a[:7])
print(a[-3:])
# Slicing in two dimensions: rows 0-1 of column 1.
y = np.arange(9).reshape(3, 3)
y[:2, 1:2]
array1 = np.arange(25).reshape(5, 5)
array1
# Flatten an array to a single row.
z = array1.ravel()
z
# Array z with all values arranged in opposite order.
z[::-1]
# Every 3rd value of z, in opposite order.
z[::-3]
# Mean number of children per house: once by hand, once with numpy.
children = np.array([0, 1, 2, 2, 5])
house_sum = sum(children)
num_of_fam = len(children)
# float() forces true division, so the hand-computed mean is not truncated
# by integer division on Python 2.
print("The mean of children in a house is %s" % (float(house_sum) / num_of_fam))
print("The mean of children in a house is %s" % np.mean(children))
# Median by hand: the middle item of the sorted values. Sorting first makes
# this independent of the order the data was entered in (odd item count).
m = len(children) // 2
print("The median of children in a house is %s" % np.sort(children)[m])
bill = 10000
ted = 24640
cyrus = 30000
john = 25000
trump = 100000000
income = [bill, ted, cyrus, john, trump]
sorted_income = sorted(income)
print("Median of sorted income is %s" % sorted_income[len(sorted_income) // 2])
print("Median of sorted income is %s" % int(np.median(sorted_income)))
# Median is less affected by outliers.
# Mean is more affected by outliers.
# When filling in empty data spaces, you may want to use the median when the
# data has lots of outliers.
#
# Create a random sample from a normal distribution with:
# Mean = 24000
# Standard Deviation = 15000
# Number of Points = 10000
income = np.random.normal(24000, 15000, 10000)
# The same sample with one extreme outlier appended.
income_with_trump = np.append(income, trump)
print("Mean: %s Median: %s Max: %s Min: %s" % (
    np.mean(income), np.median(income), income.max(), income.min()))
print("Mean: %s Median: %s Max: %s Min: %s" % (
    np.mean(income_with_trump), np.median(income_with_trump),
    income_with_trump.max(), income_with_trump.min()))
import matplotlib.pyplot as plt
# `%matplotlib inline` is IPython magic syntax and a SyntaxError in plain
# Python. Issue the same magic through the IPython API when running inside
# IPython/Jupyter; outside it get_ipython is undefined and nothing is done.
try:
    get_ipython().run_line_magic("matplotlib", "inline")
except NameError:
    pass
# Plot (data, number of rectangular bars in the histogram).
plt.hist(income, 100)
plt.show()
# numpy does not have a mode function, so np.mode raises AttributeError.
# The error is caught and printed so the demonstration does not abort the
# rest of the script.
try:
    print(np.mode(income))
except AttributeError as err:
    print("AttributeError: %s" % err)
# 50 random integer ages, from 2 up to (but not including) 25.
ages = np.random.randint(2, high=25, size=50)
ages
# scipy.stats provides the mode that numpy lacks.
from scipy import stats
print(stats.mode(ages))
# Variance and standard deviation computed by hand: the variance is the mean
# of the squared differences from the mean.
d = np.array([1, 4, 5, 4, 8])
m = np.mean(d)
differ = (d - m) ** 2
sum_differ = sum(differ)
var = sum_differ / len(d)
# The standard deviation is the square root of the variance. `var**1/2`
# parses as (var**1)/2, i.e. half the variance, so the exponent must be 0.5.
std_dev = var ** 0.5
print("%s %s" % (var, std_dev))
# Smallest and largest of 10000 draws from a normal distribution with
# mean 100 and standard deviation 20.
income = np.random.normal(100, 20, 10000)
print("%s %s" % (income.min(), income.max()))
# PANDAS
import pandas as pd
# Build a small DataFrame from a dict of equal-length columns.
df = pd.DataFrame({"name":["Bob","Jen","Tim","Jacob"],"age":[20,30,40,50],"pet":["cat","dog","bird","hamster"]})
df
df.columns
df.index
# Row slice by position: rows 1-3, i.e. not displaying the first row.
df[1:4]
# A single column is selected by its name.
df["name"]
df["pet"]
# In order to display just 2 columns, first make a list of the columns you
# want to display, so that a single list object is passed.
df[["name", "pet"]]
# A column can also be read as an attribute.
df.name
df['pet']
# `.ix` has been removed from pandas. With an integer index it selected rows
# by label, which is exactly what `.loc` does (both end labels included).
df.loc[1:3]
df = pd.DataFrame({"name":["Bob","Jen","Tim","Jacob"],"age":[20,30,40,50],"pet":["cat","dog","bird","hamster"]})
# Sorting in place reorders the rows but keeps their original index labels.
df.sort_values('pet', inplace=True)
df
# Label-based: the row whose index label is 0, wherever it now sits.
df.loc[0]
# Position-based: first row, third column.
df.iloc[0, 2]
# Label-based slice from label 0 through label 1.
df.loc[0:1]
# Move the old index labels into an 'index' column and renumber the rows,
# then drop that column again.
df.reset_index(inplace=True)
df
df.drop('index', axis=1, inplace=True)
df
# Display columns by position.
df.iloc[:, 0:2]
# Display just the columns you desire by filtering on column header names.
df.loc[:, 'name':'pet']
# Read a CSV file. With header=None no row is used as the header;
# if header != None, the first row will become the header.
data = pd.read_csv('test_pandas.csv', header=None)
data
# Write to a CSV file, leaving out the header row and the index column.
data.to_csv('test_pandas_no_header.csv', header=False, index=False)
# Read the sheet named 'Sheet1' from an Excel file.
data = pd.read_excel('test_pandas.xlsm', 'Sheet1')
data
# Write the data to two sheets of an Excel file. Using the writer as a
# context manager saves and closes the workbook when the block ends, even if
# a write fails; this replaces the explicit writer.save() call, which newer
# pandas releases no longer provide.
with pd.ExcelWriter('test_sheets.xlsx') as writer:
    data.to_excel(writer, sheet_name='Original')
    data.to_excel(writer, sheet_name='Copy')
import sqlite3
# Create a connection to the database file via sqlite3.
conn = sqlite3.connect('test_pandas.db')
# try/finally guarantees the connection is closed even if a query fails.
try:
    # Version of the SQLite library in use. (sqlite3.version, the module's
    # own version string, is deprecated in recent Python releases.)
    sqlite3.sqlite_version
    # Read the names of the tables from the sqlite_master catalogue.
    tables = conn.execute("SELECT name FROM sqlite_master WHERE type='table';")
    for name in tables:
        print(name[0])
    # Read a whole table into a DataFrame; * means choose all columns.
    sql_guery = "SELECT * FROM test"
    data = pd.read_sql(sql_guery, conn)
    data
    # Build two new rows and fix the column order explicitly.
    new_data = pd.DataFrame({'id':[6,7],'city':['Milwaukee','New England'],'mascot':['Cheesehead','Lions']})
    new_data = new_data[['id','city','mascot']]
    new_data
    # Write the data to a new table called 'new_test2'.
    new_data.to_sql('new_test2', conn)
    # Append the same rows to the existing 'test' table, without writing the
    # DataFrame index as a column.
    new_data.to_sql('test', conn, if_exists='append', index=False)
finally:
    conn.close()
from xml.etree import ElementTree as et
# Parse the XML file.
doc = et.parse('cars.xml')
# MODEL text of the first CAR element, then of the second one ('CAR[2]').
print(doc.find('CAR/MODEL').text)
print(doc.find('CAR[2]/MODEL').text)
# One line per CAR element: make, model and cost.
for element in doc.findall('CAR'):
    print(element.find('MAKE').text + " " + element.find('MODEL').text + ", " + element.find('COST').text)
import requests
# Retrieve an XML document from a web server. The timeout keeps a stalled
# server from hanging the script, and raise_for_status() stops on an HTTP
# error response instead of saving the error page as if it were the catalog.
xml = requests.get("http://www.w3schools.com/xml/cd_catalog.xml", timeout=10)
xml.raise_for_status()
print(xml.content)
# Write the downloaded bytes back to a local file.
with open('test.xml', 'wb') as code:
    code.write(xml.content)
doc = et.parse("test.xml")
# Output the album, artist, and year of each CD to the screen.
for element in doc.findall('CD'):
    print('Album:  %s' % element.find('TITLE').text)
    print('Artists:  %s' % element.find('ARTIST').text)
    print('Year:  %s \n' % element.find('YEAR').text)
# Load the Titanic training data from CSV.
titanicData = pd.read_csv('train.csv')
titanicData
# info() gives you the info for your data set.
titanicData.info()
# Number of missing values in each column.
titanicData.isnull().sum()
# head / tail display rows from the start / end of the data.
titanicData.head(10)
titanicData.tail(10)
# nunique = number of unique values
print(titanicData['Pclass'].nunique())
# unique provides the unique values themselves
titanicData['Pclass'].unique()
titanicData['Sex'].unique()
# value_counts provides the number of occurrences of each value
titanicData['Pclass'].value_counts()
titanicData.describe()
# Select two columns by passing a list of column names.
a = titanicData[['Survived', 'Name']]
a
# Label-based selection: rows up to label 5, columns 'PassengerId' to 'Name'.
titanicData.loc[:5, 'PassengerId':'Name']
print(titanicData['Sex'].unique())
titanicData['Sex'].value_counts()
# Filter rows with a boolean condition; combine conditions with & and wrap
# each one in parentheses.
titanicData[titanicData.Sex == 'male']
titanicData[(titanicData.Sex == 'male') & (titanicData.Age > 18)]
len(titanicData[(titanicData.Sex == 'male') & (titanicData.Age > 18)])
# Number of rows with Survived == 1, then with Survived == 0.
print(len(titanicData[(titanicData.Survived == 1)]))
print(len(titanicData[(titanicData.Survived == 0)]))
titanicData[(titanicData.Survived == 1)]
titanicData[titanicData.Sex == 'male']
# Boolean masks, built once and reused for every count below.
is_male = titanicData.Sex == 'male'
is_female = titanicData.Sex == 'female'
survived = titanicData.Survived == 1
# Counts are stored as floats so the divisions below are true divisions on
# Python 2 as well.
num_men_survived = float((survived & is_male).sum())
num_women_survived = float((survived & is_female).sum())
total_men = float(is_male.sum())
total_women = float(is_female.sum())
# Survival rate per sex, as a percentage rounded to two decimals.
men_survival_rate = round(num_men_survived / total_men * 100, 2)
women_survival_rate = round(num_women_survived / total_women * 100, 2)
print("Male Survival Rate: %s percent. Female Survival Rate: %s percent." % (
    men_survival_rate, women_survival_rate))