numpy
NumPy provides an efficient array implementation for Python, well suited to processing large amounts of numeric data.
Synopsis:
import numpy as np
# Convert list to numpy array.
my_list = [1,2,3]
npa = np.array(my_list)
npa
array([1, 2, 3])
type(npa)
<class 'numpy.ndarray'> # Implemented as python class ndarray
>>> help(np.array)
array(...)
array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0)
>>> npa.astype(float)
array([1., 2., 3.]) # astype() converts the dtype (upcasting here; downcasting also works).
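# A quick illustrative sketch of the reverse (downcasting): float -> int truncates.
>>> np.array([1.7, 2.2, 3.9]).astype(int)
array([1, 2, 3])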
# Convert 2D list to numpy array
>>> my_matrix = [[1,2,3],[4,5,6],[7,8,9]]
>>> npa = np.array(my_matrix)
>>> npa
array([[1, 2, 3],
[4, 5, 6],
[7, 8, 9]])
# Generate array using arange() function.
>>> np.arange(0,10)
array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
>>> np.arange(0,11,2)
array([ 0, 2, 4, 6, 8, 10])
# Generate a 1-dimensional array of 3 zeros (float by default)
>>> np.zeros(3)
array([0., 0., 0.])
# A 2D array of zeros.
>>> np.zeros((5,5))
>>> np.ones(3)
array([1., 1., 1.])
>>> np.ones((3,3)) # Displays 2D numpy array of 1.0's
# ### linspace
# Return evenly spaced numbers over a specified interval.
>>> np.linspace(0,10,3)
array([ 0., 5., 10.])
>>> np.linspace(0,10,50) # 50 numbers in 0-10 interval both inclusive.
# Note: the step is 10/49, slightly > 0.2
array([ 0. , 0.20408163, 0.40816327, 0.6122449 , 0.81632653,
.....
9.18367347, 9.3877551 , 9.59183673, 9.79591837, 10. ])
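# Sketch: linspace can also report the step it used via retstep=True; the step
# here is 10/49, which is why it is slightly above 0.2.
>>> np.linspace(0, 10, 50, retstep=True)[1]
0.20408163265306123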
#
# Create an identity matrix
>>> np.eye(4)
array([[1., 0., 0., 0.],
[0., 1., 0., 0.],
[0., 0., 1., 0.],
[0., 0., 0., 1.]])
# Create random samples from a uniform distribution
# rand === uniform distribution; Space = [0, 1) by default.
>>> np.random.rand(2) # Get me 2 random numbers.
array([0.75099837, 0.22669422])
>>> np.random.rand(2) * 100 # Change space from 0-1 to 0-100
array([97.17772849, 55.63771402])
>>> np.random.rand(5,5) # Get me 5x5 random numbers in 0-1 space.
array([[0.51726016, 0.5028085 , 0.18335166, 0.10000506, 0.32411374],
....
[0.45835505, 0.13112003, 0.23983673, 0.11599024, 0.51640221]])
# ### randn === normal distribution random numbers
#
np.random.randn(2) # Get me 2 random numbers from normal distribution.
np.random.randn(5,5) # Get me 5x5 numbers.
# Two-by-four array of samples from N(mean = 3, variance = 6.25)
# i.e. std deviation = sqrt(6.25) = 2.5
>>> 2.5 * np.random.randn(2, 4) + 3
array([[-4.49401501, 4.00950034, -1.81814867, 7.29718677],
[ 0.39924804, 4.68456316, 4.99394529, 4.84057254]])
# ### randint === "discrete uniform" distribution
# Return random integers from `low` (inclusive) to `high` (exclusive).
>>> np.random.randint(1,100) # Get single discrete uniform number.
30
>>> np.random.randint(1,100,10) # Get 10 discrete uniform numbers.
array([19, 72, 72, 54, 49, 22, 90, 61, 35, 44])
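# Sketch: seed the generator first if you want reproducible random numbers.
>>> np.random.seed(0)
>>> np.random.randint(1,100,3) # Same 3 values on every run after the same seed.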
# ## Reshape
>>> arr = np.arange(25)
>>> arr.reshape(5,5) # Fill row-wise, i.e. first fill first row.
array([[ 0, 1, 2, 3, 4],
[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19],
[20, 21, 22, 23, 24]])
# ### max,min,argmax,argmin
#
# Find max or min values and indices.
ranarr = np.random.rand(10) * 100 # Get 10 random numbers from 0-100 space.
ranarr.max() # Get max
ranarr.argmax() # Get max index.
ranarr.min()
ranarr.argmin()
>>> arr = np.arange(5)
>>> arr.reshape(1,5)
array([[0, 1, 2, 3, 4]]) # 1 row of 5 values. Note: it is 2D.
>>> arr.reshape(1,5).shape
(1, 5)
>>> arr.reshape(5,1) # 5 rows of single column.
array([[0],
[1],
[2],
[3],
[4]])
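# Sketch: one reshape dimension may be given as -1 and numpy infers it from the size.
>>> arr.reshape(-1, 1).shape # Same as reshape(5, 1)
(5, 1)
>>> arr.reshape(1, -1).shape # Same as reshape(1, 5)
(1, 5)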
#
# Array elements are of homogeneous type. Get the data type.
#
>>> arr.dtype
dtype('int64')
>>> arr.astype(float).dtype
dtype('float64')
# Array slice syntax is similar to standard list slicing ...
>>> arr = np.arange(10) # 0 ... 9
>>> arr[0:5]
array([0, 1, 2, 3, 4])
# Array broadcasting -- assign a single number to a slice.
# Note: this is not possible with a standard list.
>>> arr[0:5]=100
>>> arr
array([100, 100, 100, 100, 100, 5, 6, 7, 8, 9])
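# Broadcasting also works between arrays of different shapes; a small
# illustrative sketch adding a 1-D row to every row of a 2-D array:
>>> np.arange(6).reshape(2,3) + np.array([10, 20, 30])
array([[10, 21, 32],
       [13, 24, 35]])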
#
# By default a slice is a view into the original array, not a separate copy.
#
>>> arr = np.arange(10)
>>> my_slice = arr[0:5]
>>> my_slice[:] = 100
>>> arr
array([100, 100, 100, 100, 100, 5, 6, 7, 8, 9])
# If you want an independent copy of a slice, call copy()
>>> my_slice = arr[0:5].copy()
>>> arr3x3 = np.arange(9).reshape(3, 3)
>>> arr3x3
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
>>> arr3x3[1]
array([3, 4, 5])
# Can index as [i, j] or [i][j]
>>> arr3x3[1][0]
3
>>> arr3x3[1,0]
3
# 2D array slicing works as you would expect ...
# 2x2 submatrix from the top-right corner
>>> arr3x3[:2,1:] # Rows: 0, 1; Cols: 1, 2
array([[1, 2],
[4, 5]])
# You can index by explicit lists
>>> arr3x3[[2,0]] # Order 3rd row, then first row...
array([[6, 7, 8],
[0, 1, 2]])
# List (fancy) indexing is not exactly the same as slice indexing
>>> arr3x3[[2,0],[0,1]]
array([6, 1]) # The index lists pair up: elements (2,0) and (0,1). A 1-D result.
>>> arr3x3[[2,0], 0:2]
array([[6, 7], # A slice for the columns keeps the larger (2-D) shape
[0, 1]])
>>> arr = np.arange(1,6)
>>> arr
array([1, 2, 3, 4, 5])
>>> bool_arr = arr > 3
>>> bool_arr
array([False, False, False, True, True])
>>> arr[bool_arr]
array([4, 5])
>>> arr[arr > 3]
array([4, 5])
#
# Array Arithmetic
#
>>> arr = np.arange(0,5) # 0, 1, 2, 3, 4
>>> arr + arr # Add element-wise
>>> arr - arr # All elements become 0.
>>> arr * arr # Multiply element wise.
array([ 0, 1, 4, 9, 16])
>>> arr ** 3 # Element wise exponentiation.
#
# Dividing 0 by 0 only raises a warning and returns nan. Division is element-wise.
#
>>> arr/arr
__main__:1: RuntimeWarning: invalid value encountered in true_divide
array([nan, 1., 1., 1., 1.])
# 1/0 gives infinity -- again, just a warning.
>>> 1/arr
__main__:1: RuntimeWarning: divide by zero encountered in true_divide
array([ inf, 1. , 0.5 , 0.33333333, 0.25 ])
#
# Can do common math operations: e.g. np.sqrt, np.exp, np.log, np.sin, etc.
#
>>> np.sqrt(arr)
array([0. , 1. , 1.41421356, 1.73205081, 2. ])
>>> arr.sum()
>>> np.arange(9).reshape(3,3)
array([[0, 1, 2],
[3, 4, 5],
[6, 7, 8]])
>>> np.arange(9).reshape(3,3).sum(axis=0) # axis=0 is the axis that gets collapsed:
# sum down each column, giving one result per column.
array([ 9, 12, 15])
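# Sketch: axis=1 collapses the columns instead, giving one sum per row.
>>> np.arange(9).reshape(3,3).sum(axis=1)
array([ 3, 12, 21])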
>>> arr.std() # Standard deviation
We need pandas for its powerful data frame features. Let us take a closer look, starting from the fundamentals: the pandas Series, then the DataFrame.
>>> import pandas as pd
>>> s = pd.Series(data=[10, 20, 30])
>>> s
0 10
1 20
2 30
dtype: int64
#
# Both index value and data value could be non-integer objects.
#
>>> s['hello'] = 'world'
>>> s
0 10
1 20
2 30
hello world
dtype: object
>>> s = s.rename_axis("idx") # Name the index 'idx' (rename_axis returns a new Series)
>>> s
idx
0 10
1 20
2 30
hello world
dtype: object
>>> s2 = pd.Series(data=[2, 4, '!!!' ], index=[1, 2, 'hello'])
>>> s + s2 # You can add two Series. Labels present in both get added; others become NaN.
0 NaN # 10 + unknown = NaN
1 22 # 20 + 2
2 34 # 30 + 4
hello world!!! # 'world' + '!!!'
dtype: object
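# Sketch (numeric Series assumed): Series.add with fill_value avoids NaN for
# labels present on only one side.
>>> pd.Series([10, 20]).add(pd.Series([1, 2, 3]), fill_value=0)
0    11.0
1    22.0
2     3.0
dtype: float64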
mx3 = np.arange(6).reshape(3,2)
mx3
array([[0, 1],
[2, 3],
[4, 5]])
df = pd.DataFrame(mx3,index='R1 R2 R3'.split(), columns='C1 C2'.split())
df
C1 C2
R1 0 1
R2 2 3
R3 4 5
df['C1']
R1 0
R2 2
R3 4
Name: C1, dtype: int64
df[['C2', 'C1']]
C2 C1
R1 1 0
R2 3 2
R3 5 4
df.C1 # supported. But df.R1 is not supported. Recommended: df['C1']
R1 0
R2 2
R3 4
Name: C1, dtype: int64
df['newCol'] = df['C1'] + df['C2']
C1 C2 newCol
R1 0 1 1
R2 2 3 5
R3 4 5 9
df.drop('newCol', axis=1, inplace=True) # Drop a column in place (modifies this df).
df.drop('R3', axis=0) # Drop a row. Without inplace=True this returns a copy, so df still has R3.
# Append a row ...
>>> df.loc['R4'] = [ 6, 7 ]
>>> df
C1 C2
R1 0 1
R2 2 3
R3 4 5
R4 6 7
>>> df.loc['R5'] = 8 # Single value to replicate row ...
>>> df
C1 C2
R1 0 1
R2 2 3
R3 4 5
R4 6 7
R5 8 8
>>> df.loc['R3']
C1 4
C2 5
Name: R3, dtype: int64
# df.iloc[2] is the same as df.loc['R3'], i.e. rows can also be selected by integer position with iloc
#
# Subsetting a dataframe. Unlike numpy fancy indexing, lists of row and column labels yield a 2-D sub-frame, as you would expect.
#
df.loc[['R1', 'R2'], ['C1', 'C2']]
C1 C2
R1 0 1
R2 2 3
df.drop('R5', axis=0, inplace=True) # Drop R5 again so df is back to rows R1-R4.
df > 2 # Build boolean df.
C1 C2
R1 False False
R2 False True
R3 True True
R4 True True
>>> df[df>2] # Set NaN where it is False.
C1 C2
R1 NaN NaN
R2 NaN 3.0
R3 4.0 5.0
R4 6.0 7.0
>>> df[df.C2 > 3] # Filter rows by condition.
C1 C2
R3 4 5
R4 6 7
>>> df[ (df.C2 > 2) & (df.C1 > 4) ] # Complex condition with & or |
C1 C2
R4 6 7
>>> df.reset_index(inplace=True) # Reset index to 0..N
>>> df
index C1 C2 # The old index is preserved as a regular column named 'index'.
0 R1 0 1 # The new index is a plain 0..N range (not a multi-level index).
1 R2 2 3
2 R3 4 5
3 R4 6 7
# Add new column and set that as index.
>>> p = 'P1 P2 P3 P4'.split()
>>> df['Person'] = p
>>> df
index C1 C2 Person
0 R1 0 1 P1
1 R2 2 3 P2
2 R3 4 5 P3
3 R4 6 7 P4
>>> df.set_index(df.Person, inplace=True)
>>> df
index C1 C2 Person
Person
P1 R1 0 1 P1
P2 R2 2 3 P2
P3 R3 4 5 P3
P4 R4 6 7 P4
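# Sketch: passing the column *name* instead of the Series makes it the index
# and also drops it from the columns.
>>> df.set_index('Person', inplace=True)
>>> df.columns
Index(['index', 'C1', 'C2'], dtype='object')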
The iloc indexer selects rows and columns by integer position:
# ================= Using iloc ====================================
# Single selections using iloc and DataFrame
# Rows:
data.iloc[0] # first row of data frame - Note a pd.Series data type output.
data.iloc[[0]] # first row of data frame - Force data frame output format.
data.iloc[1] # second row of data frame
data.iloc[-1] # last row of data frame
# Columns:
data.iloc[:,0] # first column of data frame (first_name)
data.iloc[:,1] # second column of data frame (last_name)
data.iloc[:,-1] # last column of data frame (id)
# Multiple row and column selections using iloc and DataFrame
data.iloc[0:5] # first five rows of dataframe
data.iloc[:, 0:2] # first two columns of data frame with all rows
data.iloc[[0,2], [5,7]] # 1st, 3rd rows + 6th and 8th columns.
data.iloc[0:5, 5:8] # first 5 rows and columns at positions 5-7 of the data frame
The loc indexer selects rows and columns by label, and also accepts boolean conditions:
# ================= Using loc ========================================================================
data.set_index("last_name", inplace=True) # Take one column make that as "Row names" as key.
data.head()
# Select rows with index values 'Raja' and 'Guru', with all columns between 'city' and 'email'
data.loc[['Raja', 'Guru'], 'city':'email']
# Select same rows, with just 'first_name', 'address' and 'city' columns
data.loc['Raja':'Guru', ['first_name', 'address', 'city']]
# Change the index to be based on the 'id' column
data.set_index('id', inplace=True)
# select the row with 'id' = 487
data.loc[487]
#
# Select rows with first name Raja, # and all columns between 'city' and 'email'
#
data.loc[data['first_name'] == 'Raja', 'city':'email']
# Select rows with first name Raja AND gmail email addresses; all columns.
data.loc[data['email'].str.endswith("gmail.com") & (data['first_name'] == 'Raja')]
# select rows with id column between 100 and 200, and just return 'email' and 'pincode' columns
data.loc[(data['id'] > 100) & (data['id'] <= 200), ['email', 'pincode']]
# A lambda function that yields True/False values can also be used.
# Select rows where the company name has 4 words in it.
data.loc[data['company_name'].apply(lambda x: len(x.split(' ')) == 4)]
# Selections can be achieved outside of the main .loc for clarity:
# Form a separate variable with your selections:
idx = data['company_name'].apply(lambda x: len(x.split(' ')) == 4)
# Select only the True values in 'idx' and only the 3 columns specified:
data.loc[idx, ['email', 'first_name', 'company']]
# ############# Setting values conditionally ##############################
# Change the first name of all rows with an ID greater than 2000 to "Raja"
data.loc[data['id'] > 2000, "first_name"] = "Raja"
# NaN and Dealing with Missing Data
>>> df.dropna() # Drop all rows which contain at least 1 NaN
>>> df.dropna(axis=1) # Drop all columns which contain at least 1 NaN
>>> df.dropna(thresh=2) # Keep only rows with at least 2 non-NaN values (thresh = required non-NaN count)
>>> df.fillna(value='MyUnknown') # All NaN values replaced by this value.
>>> df.fillna(value=0) # All NaN values replaced by 0.
>>> df['A'].fillna(value=df['A'].mean()) # NaNs in column 'A' replaced by the column mean.
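# A minimal sketch of the kind of dataframe the calls above assume (column 'A'
# holds a NaN; the values are illustrative):
>>> df = pd.DataFrame({'A': [1, 2, np.nan], 'B': [5, np.nan, np.nan], 'C': [1, 2, 3]})
>>> df.dropna()
     A    B  C
0  1.0  5.0  1
>>> df['A'].fillna(value=df['A'].mean())
0    1.0
1    2.0
2    1.5
Name: A, dtype: float64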
# Pandas Group By
data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
'Person':['G1','G2','M1','M2','F1','F2'],
'Sales':[700,400,500,300,200,800]}
df = pd.DataFrame(data)
g_by = df.groupby('Company')
g_by
<pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f3d743140b8>
>>> g_by.mean() # Also See: std(), etc.
# Preserves only applicable numeric cols on output
Sales
Company
FB 500
GOOG 550
MSFT 400
# For min/max, Sales values are what you expect, but each column is reduced
# independently, so the Person shown is just the alphabetical min/max and need
# not belong to the row with the min/max Sales.
>>> g_by.min()
Person Sales
Company
FB F1 200
GOOG G1 400 # 'G1' is the alphabetical min of Person; the 400 sale was by G2.
MSFT M1 300
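# Sketch: to get the whole *row* with the minimum Sales per company (so Person
# and Sales stay matched), index back into df via idxmin.
>>> df.loc[df.groupby('Company')['Sales'].idxmin()]
  Company Person  Sales
4      FB     F1    200
1    GOOG     G2    400
3    MSFT     M2    300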
#
# count() includes non-numeric columns too.
# It counts non-null values per column, so here every column shows the same count.
#
>>> g_by.count()
Person Sales
Company
FB 2 2
GOOG 2 2
MSFT 2 2
>>> g_by.describe()
Sales
count mean std min 25% 50% 75% max
Company
FB 2.0 500.0 424.264069 200.0 350.0 500.0 650.0 800.0
GOOG 2.0 550.0 212.132034 400.0 475.0 550.0 625.0 700.0
MSFT 2.0 400.0 141.421356 300.0 350.0 400.0 450.0 500.0
>>> g_by.describe().transpose()
Company FB GOOG MSFT
Sales count 2.000000 2.000000 2.000000
mean 500.000000 550.000000 400.000000
std 424.264069 212.132034 141.421356
min 200.000000 400.000000 300.000000
25% 350.000000 475.000000 350.000000
50% 500.000000 550.000000 400.000000
75% 650.000000 625.000000 450.000000
max 800.000000 700.000000 500.000000
You can concatenate (i.e. append) one dataframe to another. Having the same columns is preferred; otherwise NaN values are filled in as needed:
pd.concat([df1,df2,df3])
   C1  C2  C3  NewC    # Union of the columns of df1, df2, df3.
0  ..  ..  ..  ..
1  ..  ..  ..  NaN     # NaN where the source frame lacks that column.
2  ..  ..  ..  NaN
..
Note: concat keeps the original row labels, so the result's index may contain duplicates; pass ignore_index=True (or verify_integrity=True) if that matters.
If instead you want to concatenate dataframes side by side along the columns:
pd.concat([df1,df2,df3], axis=1) # Total cols = sum(each df total cols)
C1 C2 C3 C1 C2 C3 # Column names now contain duplicates!
R1
R2
R3
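# A minimal concrete sketch of row-wise concat with non-identical columns
# (frame contents here are illustrative):
>>> df1 = pd.DataFrame({'C1': [1], 'C2': [2]})
>>> df2 = pd.DataFrame({'C1': [3], 'NewC': [4]})
>>> pd.concat([df1, df2], ignore_index=True) # ignore_index rebuilds a 0..N index
   C1   C2  NewC
0   1  2.0   NaN
1   3  NaN   4.0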
You can merge dataframes using SQL-style inner/outer join logic:
pd.merge(left,right,how='inner',on='key')
A B key C D
0 A0 B0 K0 C0 D0
1 A1 B1 K1 C1 D1
2 A2 B2 K2 C2 D2
3 A3 B3 K3 C3 D3
Merging on multiple key columns (how='inner' is the default):
pd.merge(left, right, on=['key1', 'key2'])
A B key1 key2 C D
0 A0 B0 K0 K0 C0 D0
1 A2 B2 K1 K0 C1 D1
2 A2 B2 K1 K0 C2 D2
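# Sketch of an outer join on a single key (these small frames are illustrative):
>>> left = pd.DataFrame({'key': ['K0', 'K1'], 'A': ['A0', 'A1']})
>>> right = pd.DataFrame({'key': ['K1', 'K2'], 'C': ['C1', 'C2']})
>>> pd.merge(left, right, how='outer', on='key')
  key    A    C
0  K0   A0  NaN
1  K1   A1   C1
2  K2  NaN   C2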
Consider two dataframes, left and right, with completely disjoint column names and some common index values:
left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
'B': ['B0', 'B1', 'B2']},
index=['K0', 'K1', 'K2'])
right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
'D': ['D0', 'D2', 'D3']},
index=['K0', 'K2', 'K3'])
left.join(right) # Preserves all rows of left. Since the join is on the index, there is at most 1 match in right.
# Note: rows of right whose index (K3) has no match in left are ignored.
A B C D
K0 A0 B0 C0 D0
K1 A1 B1 NaN NaN
K2 A2 B2 C2 D2
# A join on the index can also be an outer join; then the K3 row is included.
left.join(right, how='outer')
R | Pandas
-------------------------------
summary(df) | df.describe()
head(df) | df.head()
dim(df) | df.shape
slice(df, 1:10) | df.iloc[0:10]
str(df) | df.info() # See the note below
table(df$col) | df['col'].value_counts() # Values by decreasing Freq.
The R function str(df) gives you a short peek into the column values, which pandas df.info() does not. But you can easily write a custom function that gives an even more useful peek at the unique values:
def rstr(df):
    print('Dimension: ', df.shape)
    print(df.apply(lambda x: [x.unique()])) # Peek into unique values.
R also offers instant access to many datasets to play with. You can do the same in Python, but you must first install a helper package:
pip install pydataset
from pydataset import data
data() # See info summary of 700+ datasets available
dataset_id title
0 AirPassengers Monthly Airline Passenger Numbers 1949-1960
1 BJsales Sales Data with Leading Indicator
.....
755 grouseticks Data on red grouse ticks from Elston et al. 2001
756 sleepstudy Reaction times in a sleep deprivation study
>>> iris = data('iris')
>>> iris.describe()
Sepal.Length Sepal.Width Petal.Length Petal.Width
count 150.000000 150.000000 150.000000 150.000000
mean 5.843333 3.057333 3.758000 1.199333
std 0.828066 0.435866 1.765298 0.762238
min 4.300000 2.000000 1.000000 0.100000
25% 5.100000 2.800000 1.600000 0.300000
50% 5.800000 3.000000 4.350000 1.300000
75% 6.400000 3.300000 5.100000 1.800000
max 7.900000 4.400000 6.900000 2.500000
>>> iris.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
Sepal.Length 150 non-null float64
Sepal.Width 150 non-null float64
Petal.Length 150 non-null float64
Petal.Width 150 non-null float64
Species 150 non-null object
dtypes: float64(4), object(1)
memory usage: 7.0+ KB
>>> iris.head()
Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1 5.1 3.5 1.4 0.2 setosa
2 4.9 3.0 1.4 0.2 setosa
3 4.7 3.2 1.3 0.2 setosa
4 4.6 3.1 1.5 0.2 setosa
5 5.0 3.6 1.4 0.2 setosa
>>> def rstr(df):
...     print('Dimension: ', df.shape)
...     print(df.apply(lambda x: [x.unique()])) # Peek into unique values.
...
>>> rstr(iris)
Dimension: (150, 5)
Sepal.Length [[5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5, 4.5, 5.3, 7.0, 6.4,...
Sepal.Width [[3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, 4.0, 4.4, 3.8, 3.3, 4.1, 4.2, 2.3, 2.8,...
Petal.Length [[1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1.0, 1.9, 4.7, 4.5, 4.9, 4.0, 4.6, 3.3, 3.9, 3.5,...
Petal.Width [[0.2, 0.4, 0.3, 0.1, 0.5, 0.6, 1.4, 1.5, 1.3, 1.6, 1.0, 1.1, 1.8, 1.2, 1.7, 2.5, 1.9,...
Species [[setosa, versicolor, virginica]]
dtype: object
>>> iris['Sepal.Length'].value_counts().head()
5.0 10 # The value 5.0 occurs most often: 10 times.
6.3 9 # Ordered descending by frequency of values.
5.1 9
6.7 8
5.7 8
Name: Sepal.Length, dtype: int64
#
# Order value counts by value instead of frequency ...
#
>>> iris['Sepal.Length'].value_counts().sort_index(ascending=False).head()
7.9 1 # Highest value occurs one time
7.7 4 # There are 4 data points with value = 7.7
7.6 1
7.4 1
7.3 1
Name: Sepal.Length, dtype: int64
A typical end-to-end example: import the libraries, read a CSV into a dataframe, inspect it, and split it into training and test sets.
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
df = pd.read_csv('iris.csv')
df.head(5)
df.columns
# Output: Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species'], dtype='object')
df.shape # Returns tuple (total_rows, total_columns)
df.ndim # Total number of dimensions. For all data frames it is just 2
# Let us try to predict the last column, i.e. Species
X = df.iloc[:, :-1].values # Everything except the last column
y = df.iloc[:, 4].values # Only the last column (Species).
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split # (sklearn.cross_validation is the old module name, removed in newer versions)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
len(X_train)
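# Quick sanity check of the 80/20 split (iris has 150 rows, so roughly 120/30):
len(X_train), len(X_test) # -> (120, 30)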
# Feature Scaling
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""