numpy

NumPy provides an efficient array implementation in Python, well suited to processing large numeric datasets.

Synopsis:

import numpy as np

# Convert list to numpy array.
my_list = [1,2,3]
npa = np.array(my_list)
npa
          array([1, 2, 3])

type(npa)
          <class 'numpy.ndarray'>   # Implemented as python class ndarray

>>> help(np.array)

       array(...)
          array(object, dtype=None, copy=True, order='K', subok=False, ndmin=0)

>>> npa.astype(float)
                    array([1., 2., 3.])   # Cast to another dtype (here int -> float).

# Convert 2D list to numpy array
>>> my_matrix = [[1,2,3],[4,5,6],[7,8,9]]
>>> npa = np.array(my_matrix)
>>> npa

  array([[1, 2, 3],
         [4, 5, 6],
         [7, 8, 9]])

# Generate array using arange() function.
>>> np.arange(0,10)
                  array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

>>> np.arange(0,11,2)
                  array([ 0,  2,  4,  6,  8, 10])

# Generate a 1-D array of 3 zeros   (float dtype by default)
>>> np.zeros(3)
  array([0., 0., 0.])

# A 2D array of zeros.
>>> np.zeros((5,5))

>>> np.ones(3)
  array([1., 1., 1.])

>>> np.ones((3,3))    # Displays 2D numpy array of 1.0's


# ### linspace
# Return evenly spaced numbers over a specified interval.

>>> np.linspace(0,10,3)   
                          array([ 0.,  5., 10.])

>>> np.linspace(0,10,50)  # 50 evenly spaced numbers over [0, 10], both endpoints included.
                          # Note: the spacing is 10/49, slightly more than 0.2.

  array([ 0.        ,  0.20408163,  0.40816327,  0.6122449 ,  0.81632653,
         .....
          9.18367347,  9.3877551 ,  9.59183673,  9.79591837, 10.        ])

#
# Create an identity matrix

>>> np.eye(4)

  array([[1., 0., 0., 0.],
         [0., 1., 0., 0.],
         [0., 0., 1., 0.],
         [0., 0., 0., 1.]])


# Create random samples from a uniform distribution
#       rand === uniform distribution; Space = [0, 1) by default.

>>> np.random.rand(2)     # Get me 2 random numbers.

              array([0.75099837, 0.22669422])

>>> np.random.rand(2)  * 100    # Change space from 0-1 to 0-100

              array([97.17772849, 55.63771402])

>>> np.random.rand(5,5)   # Get me 5x5 random numbers in 0-1 space.

       array([[0.51726016, 0.5028085 , 0.18335166, 0.10000506, 0.32411374],
              ....
              [0.45835505, 0.13112003, 0.23983673, 0.11599024, 0.51640221]])


# ### randn  === normal distribution random numbers
#

np.random.randn(2)     # Get me 2 random numbers from normal distribution. 

np.random.randn(5,5)   # Get me 5x5 numbers.

# Two-by-four array of samples from N(mean = 3, variance = 6.25)
# i.e. std deviation = sqrt(6.25) = 2.5

>>> 2.5 * np.random.randn(2, 4) + 3

         array([[-4.49401501,  4.00950034, -1.81814867,  7.29718677], 
                [ 0.39924804,  4.68456316,  4.99394529,  4.84057254]])

# ### randint  === "discrete uniform" distribution
# Return random integers from `low` (inclusive) to `high` (exclusive).

>>> np.random.randint(1,100)           # Get single discrete uniform number.

       30

>>> np.random.randint(1,100,10)        # Get 10 discrete uniform numbers.

       array([19, 72, 72, 54, 49, 22, 90, 61, 35, 44])


# ## Reshape

>>> arr = np.arange(25)
>>> arr.reshape(5,5)      # Fill row-wise, i.e. first fill first row.

  array([[ 0,  1,  2,  3,  4],
         [ 5,  6,  7,  8,  9],
         [10, 11, 12, 13, 14],
         [15, 16, 17, 18, 19],
         [20, 21, 22, 23, 24]])

# ### max,min,argmax,argmin
#
# Find max or min values and indices. 

ranarr = np.random.rand(10)  * 100    # Get 10 random numbers from 0-100 space.
ranarr.max()              # Get max
ranarr.argmax()           # Get max index.
ranarr.min()
ranarr.argmin()


>>> arr = np.arange(5)
>>> arr.reshape(1,5)         

        array([[0, 1, 2, 3, 4]])     # 1 row of 5 values. Note: it is 2D.

>>> arr.reshape(1,5).shape
           (1,5)

>>> arr.reshape(5,1)              # 5 rows of single column.

              array([[0],
                     [1],
                     [2],
                     [3],
                     [4]])

#
# Array elements are of homogenous type. Get data type.
#

>>> arr.dtype

    dtype('int64')

>>> arr.astype(float).dtype

    dtype('float64')


# Array slice syntax is similar to std list ...
>>> arr = np.arange(10)   # 0 ... 9
>>> arr[0:5]

    array([0, 1, 2, 3, 4])

# Array broadcasting -- assign a single number to a whole slice.
# Note: This is not possible with a standard list

>>> arr[0:5]=100
>>> arr   
         array([100, 100, 100, 100, 100,   5,   6,   7,   8,   9])
#
# By default a slice is a view into the original array, not a separate copy.
#
>>> arr = np.arange(10)
>>> my_slice = arr[0:5]
>>> my_slice[:] = 100
>>> arr

     array([100, 100, 100, 100, 100,   5,   6,   7,   8,   9])

# If you want a slice with copy, call copy()
>>> my_slice = arr[0:5].copy()

>>> arr3x3 = np.arange(9).reshape(3, 3)
>>> arr3x3

  array([[0, 1, 2],
         [3, 4, 5],
         [6, 7, 8]])

>>> arr3x3[1]    

    array([3, 4, 5])

#  Can index as [i, j] or [i][j]
>>> arr3x3[1][0]
                    3
>>> arr3x3[1,0]
                    3

# Matrix/array slicing works as you would expect ...
# Take the 2x2 submatrix from the top-right corner

>>> arr3x3[:2,1:]       # Rows: 0, 1; Cols: 1, 2

       array([[1, 2],
              [4, 5]])

# You can index by explicit lists
>>> arr3x3[[2,0]]   # Order 3rd row, then first row...

      array([[6, 7, 8],
             [0, 1, 2]])

# List indexing is not exactly same as 'Slice indexing'
>>> arr3x3[[2,0],[0,1]] 

     array([6, 1])    # This returns smaller dimension results.

>>> arr3x3[[2,0], 0:2] 

    array([[6, 7],    # Slice indexing for column: Larger dim result
           [0, 1]])

>>> arr = np.arange(1,6)     

>>> arr
         array([1, 2, 3, 4, 5])

>>> bool_arr = arr > 3
>>> bool_arr
        array([False, False, False,  True,  True])

>>> arr[bool_arr]
        array([4, 5])

>>> arr[arr > 3]
        array([4, 5])

#
#                         Array Arithmetic
#

>>> arr = np.arange(0,5)    # 0, 1, 2, 3, 4
>>> arr + arr               # Add element-wise
>>> arr - arr               # All elements become 0
>>> arr * arr               # Multiply element wise.

      array([ 0,  1,  4,  9, 16])

>>> arr ** 3               # Element wise exponentiation.

#
# Element-wise division. 0/0 only raises a warning and yields nan.
#

>>>  arr/arr
     __main__:1: RuntimeWarning: invalid value encountered in true_divide
     array([nan,  1.,  1.,  1.,  1.])

# 1/0 yields inf -- again, only a warning.
>>> 1/arr

   __main__:1: RuntimeWarning: divide by zero encountered in true_divide
   array([       inf, 1.        , 0.5       , 0.33333333, 0.25      ])

#
# Can do common math operations: e.g. np.sqrt, np.exp, np.log, np.sin, etc.
#
>>> np.sqrt(arr)

    array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ])

>>> arr.sum()          
>>> np.arange(9).reshape(3,3)

                array([[0, 1, 2],
                       [3, 4, 5],
                       [6, 7, 8]])

>>> np.arange(9).reshape(3,3).sum(axis=0)  # The given axis is the one that gets collapsed:
                                           # axis=0 sums down each column.
                array([ 9, 12, 15])
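
# For comparison, axis=1 collapses the columns instead, giving row-wise sums.

>>> np.arange(9).reshape(3,3).sum(axis=1)
                array([ 3, 12, 21])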

>>> arr.std()          # Standard deviation

Pandas

We need pandas for its powerful DataFrame features. Let us take a closer look, starting from the fundamentals: first the pandas Series, then the DataFrame.

Pandas Series

  • It is built on top of numpy arrays.
  • It can have axis labels, i.e. you can index by label.
  • It can hold non-trivial data types, i.e. objects.
>>> import pandas as pd
>>> s = pd.Series(data=[10, 20, 30])
>>> s
        0    10            
        1    20
        2    30

        dtype: int64

#
# Both index value and data value could be non-integer objects.
#
>>> s['hello'] = 'world'
>>> s
          0           10
          1           20
          2           30
          hello    world

          dtype: object

>>> s = s.rename_axis("idx")           # Name the index as 'idx' (rename_axis returns a new Series)
>>> s
        idx
        0        10
        1        20
        2        30
        hello    world
        dtype: object

>>> s2 =  pd.Series(data=[2, 4, '!!!' ], index=[1, 2, 'hello'])        

>>> s + s2        # You can add two series. Common index will get added.

              0             NaN           # 10 + unknown = NaN
              1              22           # 20 + 2
              2              34           # 30 + 4
              hello    world!!!           # 'world' + '!!!'
              dtype: object

DataFrame Operations

mx3 = np.arange(6).reshape(3,2)
mx3

       array([[0, 1],
              [2, 3],
              [4, 5]])

df = pd.DataFrame(mx3,index='R1 R2 R3'.split(), columns='C1 C2'.split())
df
              C1  C2
          R1   0   1
          R2   2   3
          R3   4   5

df['C1']

          R1    0
          R2    2
          R3    4
          Name: C1, dtype: int64

df[['C2', 'C1']]

               C2  C1
          R1   1   0
          R2   3   2
          R3   5   4

df.C1   # supported. But df.R1 is not supported. Recommended: df['C1']

          R1    0
          R2    2
          R3    4
          Name: C1, dtype: int64

df['newCol'] = df['C1'] + df['C2']

                 C1  C2  newCol
            R1   0   1       1
            R2   2   3       5
            R3   4   5       9

df.drop('newCol',axis=1, inplace = True)    # Drop a column. inplace=True modifies df itself.
df.drop('R3',    axis=0)                    # Drop a row. Without inplace=True this returns a copy,
                                            # so df below still contains R3.

# Append a row ...

>>> df.loc['R4'] = [ 6, 7 ]
>>> df
              C1  C2
           R1   0   1
           R2   2   3
           R3   4   5
           R4   6   7

>>> df.loc['R5'] = 8          # Single value to replicate row ...
>>> df
              C1  C2
          R1   0   1
          R2   2   3
          R3   4   5
          R4   6   7
          R5   8   8

>>> df.loc['R3']
          C1    4
          C2    5
          Name: R3, dtype: int64

# df.iloc[2] === df.loc['R3'], i.e. you can also refer to a row by its integer position using iloc

#
# Subsetting a DataFrame with two label lists returns the full sub-frame,
# unlike numpy fancy indexing with two lists, which pairs the indices element-wise.
#
df.loc[['R1', 'R2'], ['C1', 'C2']]

                   C1  C2
              R1   0   1
              R2   2   3
>>> df.drop('R5', inplace=True)  # Drop R5 again to keep the examples below small.
>>> df > 2                       # Build boolean df.

                     C1     C2
              R1  False  False
              R2  False   True
              R3   True   True
              R4   True   True

>>> df[df>2]                    # Set NaN where it is False.

                   C1   C2
              R1  NaN  NaN
              R2  NaN  3.0
              R3  4.0  5.0
              R4  6.0  7.0

>>> df[df.C2 > 3]            # Filter rows by condition.
                  C1  C2
              R3   4   5
              R4   6   7

>>> df[ (df.C2 > 2) & (df.C1 > 4) ]   # Complex condition with & or |

                  C1  C2
              R4   6   7

>>> df.reset_index(inplace=True)      # Reset index to 0..N
>>> df

                index  C1  C2         # Old index preserved as a regular column named 'index'.
              0    R1   0   1         # The new index is a plain 0..N range, not a MultiIndex.
              1    R2   2   3
              2    R3   4   5
              3    R4   6   7

# Add new column and set that as index.

>>> p = 'P1 P2 P3 P4'.split()
>>> df['Person'] = p
>>> df
              index  C1  C2 Person
            0    R1   0   1     P1
            1    R2   2   3     P2
            2    R3   4   5     P3
            3    R4   6   7     P4

>>> df.set_index(df.Person, inplace=True)
>>> df

                       index  C1  C2 Person
            Person
            P1        R1   0   1     P1
            P2        R2   2   3     P2
            P3        R3   4   5     P3
            P4        R4   6   7     P4

DataFrame MultiIndex Support

Using iloc vs loc for dataframe indexing

The iloc indexer:

  • Selects rows and columns by integer position (single integers, lists, or ranges).
  • Returns a pandas Series when the result is a single row or column, otherwise a DataFrame.
  • You can force a DataFrame result by passing a single-element list as the index.
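
The iloc/loc snippets in this section reference a DataFrame named data that is never defined here. A hypothetical sample frame (column names copied from the examples, values made up) could be:

import pandas as pd

data = pd.DataFrame({
    'first_name':   ['Raja', 'Guru', 'Mani'],
    'last_name':    ['Kumar', 'Prasad', 'Iyer'],
    'id':           [101, 487, 2501],
    'address':      ['1 A Street', '2 B Street', '3 C Street'],
    'city':         ['Chennai', 'Madurai', 'Salem'],
    'pincode':      [600001, 625001, 636001],
    'email':        ['raja@gmail.com', 'guru@yahoo.com', 'mani@gmail.com'],
    'company_name': ['Acme Widgets India Ltd', 'Globex', 'Initech Corp'],
})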
# =================  Using  iloc ====================================
# Single selections using iloc and DataFrame

# Rows:

data.iloc[0]   # first row of data frame - Note a pd.Series data type output.
data.iloc[[0]] # first row of data frame - Force data frame output format.
data.iloc[1]   # second row of data frame 
data.iloc[-1]  # last row of data frame 

# Columns:

data.iloc[:,0]    # first column of data frame (first_name)
data.iloc[:,1]    # second column of data frame (last_name)
data.iloc[:,-1]   # last column of data frame (id)

# Multiple row and column selections using iloc and DataFrame
data.iloc[0:5]        # first five rows of dataframe
data.iloc[:, 0:2]     # first two columns of data frame with all rows
data.iloc[[0,2], [5,7]] # 1st, 3rd rows + 6th and 8th columns.
data.iloc[0:5, 5:8]   # first 5 rows and 5-7th columns of data frame

Note:

  • The .iloc indexer returns a pandas Series when only one row or one column is selected; otherwise it returns a DataFrame.
  • To counter this, pass a single-valued list if you require DataFrame output.
  • In practice, you rarely use the iloc indexer, unless you want the first ( .iloc[0] ) or the last ( .iloc[-1] ) row, etc.

The loc indexing can be used for :

  • Selecting rows by label/index
  • Selecting rows with a boolean / conditional lookup vector
# =================  Using  loc ========================================================================

data.set_index("last_name", inplace=True)          # Take one column make that as "Row names" as key.
data.head()


# Select rows with index values 'Raja' and 'Guru', with all columns between 'city' and 'email'
data.loc[['Raja', 'Guru'], 'city':'email']

# Select same rows, with just 'first_name', 'address' and 'city' columns
data.loc['Raja':'Guru', ['first_name', 'address', 'city']]

# Change the index to be based on the 'id' column
data.set_index('id', inplace=True)
# select the row with 'id' = 487
data.loc[487]

#               
# Select rows with first name Raja, # and all columns between 'city' and 'email'
#
data.loc[data['first_name'] == 'Raja', 'city':'email']

# Select rows with first name Raja AND gmail email addresses; all columns.
data.loc[data['email'].str.endswith("gmail.com") & (data['first_name'] == 'Raja')]

# select rows with id column between 100 and 200, and just return 'email' and 'pincode' columns
data.loc[(data['id'] > 100) & (data['id'] <= 200), ['email', 'pincode']]

# A lambda function that yields True/False values can also be used.
# Select rows where the company name has 4 words in it.
data.loc[data['company_name'].apply(lambda x: len(x.split(' ')) == 4)]

# Selections can be achieved outside of the main .loc for clarity:
# Form a separate variable with your selections:
idx = data['company_name'].apply(lambda x: len(x.split(' ')) == 4)
# Select only the True values in 'idx' and only the 3 columns specified:
data.loc[idx, ['email', 'first_name', 'company_name']]

# #############   Setting values conditionally ##############################
# Change the first name of all rows with an ID greater than 2000 to "Raja"
data.loc[data['id'] > 2000, "first_name"] = "Raja"

DataFrame Handling Missing Data

# NaN and Dealing with Missing Data
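
The examples below assume a df with some missing values; a small hypothetical frame could be:

import numpy as np
import pandas as pd

df = pd.DataFrame({'A': [1.0, 2.0, np.nan],
                   'B': [5.0, np.nan, np.nan],
                   'C': [10, 20, 30]})      # Made-up values; column 'A' is used in the mean() example.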

>>> df.dropna() # Drop all rows which contain at least 1 NaN

>>> df.dropna(axis=1) # Drop all columns which contain at least 1 NaN

>>> df.dropna(thresh=2) # Keep only rows with at least 2 non-NaN values; drop the rest

>>> df.fillna(value='MyUnknown') # All NaN values assigned this value.

>>> df.fillna(value=0) # All NaN values assigned to 0.

>>> df['A'].fillna(value=df['A'].mean()) # NaN value for one column set to avg.

Dataframe Group By

# Pandas Group By

data = {'Company':['GOOG','GOOG','MSFT','MSFT','FB','FB'],
   'Person':['G1','G2','M1','M2','F1','F2'],
   'Sales':[700,400,500,300,200,800]}

df = pd.DataFrame(data)
g_by = df.groupby('Company')
g_by
        <pandas.core.groupby.groupby.DataFrameGroupBy object at 0x7f3d743140b8>

>>> g_by.mean()            # Also See: std(), etc.
                           # Preserves only applicable numeric cols on output

                        Sales
              Company
              FB         500
              GOOG       550
              MSFT       400

# min() and max() are computed per column independently: the Sales values are
# meaningful, but the Person value is just the alphabetically smallest name,
# not the person who had the min/max sales.

>>> g_by.min()

                         Person  Sales
              Company
              FB          F1    200
              GOOG        G1    400   # G1 is the alphabetical min, not the min-sales person (G2).
              MSFT        M1    300

#
# count() also includes non-numeric columns; each cell is simply the number
# of non-null values in that column (here every count is 2).
#
>>> g_by.count()
                       Person  Sales   
              Company
              FB            2      2
              GOOG          2      2
              MSFT          2      2

>>> g_by.describe()
                      Sales                                                      
                      count   mean         std    min    25%    50%    75%    max
              Company                                                            
              FB        2.0  500.0  424.264069  200.0  350.0  500.0  650.0  800.0
              GOOG      2.0  550.0  212.132034  400.0  475.0  550.0  625.0  700.0
              MSFT      2.0  400.0  141.421356  300.0  350.0  400.0  450.0  500.0

>>> g_by.describe().transpose()
      Company              FB        GOOG        MSFT
      Sales count    2.000000    2.000000    2.000000
            mean   500.000000  550.000000  400.000000
            std    424.264069  212.132034  141.421356
            min    200.000000  400.000000  300.000000
            25%    350.000000  475.000000  350.000000
            50%    500.000000  550.000000  400.000000
            75%    650.000000  625.000000  450.000000
            max    800.000000  700.000000  500.000000

Dataframe Concatenation by row or column

You can concatenate (append) one DataFrame to another. Having the same columns is preferred; otherwise NaN values are filled in as needed:

pd.concat([df1,df2,df3])

      C1 C2 C3 NewC
  0 
  1            NaN
  2            NaN
  ..

Note: concatenation can produce duplicate row labels; pass ignore_index=True if you want a fresh 0..N index instead.

If you want to force merging dataframes along columns:

pd.concat([df1,df2,df3], axis=1)   # Total cols = sum(each df total cols)

          C1 C2 C3  C1 C2 C3   # Col names now contain duplicates!
    R1
    R2
    R3
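
A concrete runnable sketch of both forms, using two small made-up frames with the same columns:

df1 = pd.DataFrame({'C1': [1, 2], 'C2': [3, 4]})
df2 = pd.DataFrame({'C1': [5, 6], 'C2': [7, 8]})

pd.concat([df1, df2])                     # Stack rows; index is 0,1,0,1 (duplicates allowed).
pd.concat([df1, df2], ignore_index=True)  # Stack rows with a fresh 0..3 index.
pd.concat([df1, df2], axis=1)             # Side by side; column names C1,C2,C1,C2.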

Dataframe Inner/Outer Joins aka Merging

You can merge dataframes using SQL inner/outer join logic :

pd.merge(left,right,how='inner',on='key')

    A   B   key C   D
0   A0  B0  K0  C0  D0
1   A1  B1  K1  C1  D1
2   A2  B2  K2  C2  D2
3   A3  B3  K3  C3  D3
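
For reference, the left and right frames assumed above can be reconstructed from the output shown:

left  = pd.DataFrame({'A':   ['A0', 'A1', 'A2', 'A3'],
                      'B':   ['B0', 'B1', 'B2', 'B3'],
                      'key': ['K0', 'K1', 'K2', 'K3']})
right = pd.DataFrame({'key': ['K0', 'K1', 'K2', 'K3'],
                      'C':   ['C0', 'C1', 'C2', 'C3'],
                      'D':   ['D0', 'D1', 'D2', 'D3']})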

Merging on multiple key columns (still an inner join by default):

pd.merge(left, right, on=['key1', 'key2'])

    A   B   key1 key2  C   D
0   A0  B0  K0   K0    C0  D0
1   A2  B2  K1   K0    C1  D1
2   A2  B2  K1   K0    C2  D2

Dataframe Join using Index

Consider two frames, left and right, with completely disjoint column names and some common index values:

left = pd.DataFrame({'A': ['A0', 'A1', 'A2'],
                     'B': ['B0', 'B1', 'B2']},
                      index=['K0', 'K1', 'K2'])

right = pd.DataFrame({'C': ['C0', 'C2', 'C3'],
                    'D': ['D0', 'D2', 'D3']},
                      index=['K0', 'K2', 'K3'])

left.join(right)   # Preserve all rows of left. Since the join is on the index, there is at most 1 match in right.
                   # Note: non-matching index rows of right (K3) are ignored.

            A   B   C   D
        K0  A0  B0  C0  D0
        K1  A1  B1  NaN NaN
        K2  A2  B2  C2  D2

# Join by index could also be outer join. Then K3 row will be included.
left.join(right, how='outer')

Dataframe Misc Operations

  • Apply a function: df['mycol'].apply(lambda x : 2 * x)
  • Display head: df.head()
  • Sorting: df.sort_values(by='col2')   (see the sketch below)
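
A short runnable sketch of the above, on a small made-up frame (column names match the bullets):

df = pd.DataFrame({'mycol': [3, 1, 2], 'col2': [30, 10, 20]})

df['mycol'].apply(lambda x: 2 * x)   # Element-wise transform -> Series with 6, 2, 4
df.head()                            # First 5 rows (here, all 3)
df.sort_values(by='col2')            # Rows reordered so col2 reads 10, 20, 30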

Pandas input/output

  • df = pd.read_csv('example.csv')
  • df.to_csv('example.csv',index=False)
  • pd.read_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
  • df.to_excel('Excel_Sample.xlsx',sheet_name='Sheet1')
  • df_list = pd.read_html('http://www.fdic.gov/bank/individual/failed/banklist.html') Each table in the html is parsed as a dataframe!
  • Read a db table as a dataframe: df = pd.read_sql_table(table_name, con[, schema, ...]) Also see: read_sql_query()
  • Write data to a SQL database: DataFrame.to_sql(name, con, ...)   (see the round-trip sketch below)
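
A minimal SQL round-trip sketch, assuming SQLAlchemy is installed (table and column names here are made up):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('sqlite:///:memory:')   # Throwaway in-memory SQLite database.

df = pd.DataFrame({'id': [1, 2], 'name': ['a', 'b']})
df.to_sql('people', engine, index=False)       # Write the frame as table 'people'.
pd.read_sql_table('people', engine)            # Read it back as a DataFrame.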

Compute Correlation between columns

  • Use df[ ['col1', 'col2', 'col3' ] ].corr() to produce a correlation matrix. Higher absolute values indicate stronger correlation; values close to zero indicate little or none. See the sketch below.
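
For example, on a small made-up frame with perfect positive and negative correlations:

df = pd.DataFrame({'col1': [1, 2, 3, 4],
                   'col2': [2, 4, 6, 8],     # Perfectly correlated with col1.
                   'col3': [4, 3, 2, 1]})    # Perfectly anti-correlated with col1.

df[['col1', 'col2', 'col3']].corr()

          col1  col2  col3
    col1   1.0   1.0  -1.0
    col2   1.0   1.0  -1.0
    col3  -1.0  -1.0   1.0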

Pandas Equivalent for R Functions

R               | Pandas
-------------------------------
summary(df)     | df.describe()
head(df)        | df.head()
dim(df)         | df.shape
slice(df, 1:10) | df.iloc[:10]
str(df)         | df.info(); **Note-below
table(df$col)   | df['col'].value_counts()  # Values by decreasing Freq.

The R function str(df) gives you a short peek into column values, which pandas df.info() does not. But you can easily write a custom function that is even more useful, giving a peek at the unique values:

def rstr(df):
  print('Dimension: ', df.shape)
  print(df.apply(lambda x: [x.unique()])) # Peek into unique values.

R also offers instant access to many datasets to play with. You can do the same with Python, but you must install some useful packages:

  • The scikit-learn package comes with several useful datasets.
  • Install the pydataset package to get almost all R built-in datasets, with the convenience function data('datasetname') for loading them.
pip install pydataset
from pydataset import data
data()                            # See info summary of 700+ datasets available

                 dataset_id                                          title

     0        AirPassengers    Monthly Airline Passenger Numbers 1949-1960
     1              BJsales              Sales Data with Leading Indicator
     .....
     755        grouseticks    Data on red grouse ticks from Elston et al. 2001
     756         sleepstudy    Reaction times in a sleep deprivation study

>>> iris = data('iris')

>>> iris.describe()

          Sepal.Length  Sepal.Width  Petal.Length  Petal.Width
   count    150.000000   150.000000    150.000000   150.000000
   mean       5.843333     3.057333      3.758000     1.199333
   std        0.828066     0.435866      1.765298     0.762238
   min        4.300000     2.000000      1.000000     0.100000
   25%        5.100000     2.800000      1.600000     0.300000
   50%        5.800000     3.000000      4.350000     1.300000
   75%        6.400000     3.300000      5.100000     1.800000
   max        7.900000     4.400000      6.900000     2.500000


>>> iris.info()

   <class 'pandas.core.frame.DataFrame'>
   Int64Index: 150 entries, 1 to 150
   Data columns (total 5 columns):
   Sepal.Length    150 non-null float64
   Sepal.Width     150 non-null float64
   Petal.Length    150 non-null float64
   Petal.Width     150 non-null float64
   Species         150 non-null object
   dtypes: float64(4), object(1)
   memory usage: 7.0+ KB

>>> iris.head()

      Sepal.Length  Sepal.Width  Petal.Length  Petal.Width Species
   1           5.1          3.5           1.4          0.2  setosa
   2           4.9          3.0           1.4          0.2  setosa
   3           4.7          3.2           1.3          0.2  setosa
   4           4.6          3.1           1.5          0.2  setosa
   5           5.0          3.6           1.4          0.2  setosa

>>> def rstr(df):
   ...
   ...       print('Dimension: ', df.shape)
   ...       print(df.apply(lambda x: [x.unique()])) # Peek into unique values.
   ...

>>> rstr(iris)

   Dimension:  (150, 5)
   Sepal.Length    [[5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.4, 4.8, 4.3, 5.8, 5.7, 5.2, 5.5, 4.5, 5.3, 7.0, 6.4,...
   Sepal.Width     [[3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, 4.0, 4.4, 3.8, 3.3, 4.1, 4.2, 2.3, 2.8,...
   Petal.Length    [[1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1.0, 1.9, 4.7, 4.5, 4.9, 4.0, 4.6, 3.3, 3.9, 3.5,...
   Petal.Width     [[0.2, 0.4, 0.3, 0.1, 0.5, 0.6, 1.4, 1.5, 1.3, 1.6, 1.0, 1.1, 1.8, 1.2, 1.7, 2.5, 1.9,...
   Species                                                                 [[setosa, versicolor, virginica]]
   dtype: object

>>> iris['Sepal.Length'].value_counts().head()

         5.0    10             # The value 5.0 occurs most often: 10 times.
         6.3     9             # Ordered descending by frequency of values.
         5.1     9
         6.7     8
         5.7     8
         Name: Sepal.Length, dtype: int64

#
# Order value counts by value instead of frequency ...
#
>>> iris['Sepal.Length'].value_counts().sort_index(ascending=False).head()

       7.9    1                 # Highest value occurs one time
       7.7    4                 # There are 4 data points with value = 7.7
       7.6    1
       7.4    1
       7.3    1
       Name: Sepal.Length, dtype: int64

Note

  • Keep in mind that array indexing starts from 1 in R, but from 0 in Python.
  • R built-in datasets are also available with Python by installing pydataset package.

Machine Learning Introduction

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os                                                                                         

df = pd.read_csv('iris.csv')
df.head(5)

df.columns
# Output:  Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width', 'Species'], dtype='object')

df.shape    # Returns tuple (total_rows, total_columns)
df.ndim     # Total number of dimensions. For all data frames it is just 2

# Let us try to predict the last column, i.e. Species

X = df.iloc[:, :-1].values                # Everything except the last column

y = df.iloc[:, 4].values                  # Only the last column (Species).

# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

len(X_train)

# Feature Scaling (left commented out here; StandardScaler applies to numeric features, not the string labels in y)
"""from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)
sc_y = StandardScaler()
y_train = sc_y.fit_transform(y_train)"""
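
A simple classifier can then be fit and evaluated on the split; the choice of model below is illustrative only:

from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=5)   # Illustrative choice; any sklearn classifier has the same interface.
clf.fit(X_train, y_train)                   # Learn from the training rows.
y_pred = clf.predict(X_test)                # Predict Species for the held-out rows.
clf.score(X_test, y_test)                   # Mean accuracy on the test set.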