Wednesday, May 3, 2023

Python Modules in Machine Learning

 1. Numpy

NumPy is a module for working quickly with numerical arrays. It comes preinstalled with Anaconda (a Python distribution).

import numpy as np


create ndarray

arr = np.array([1, 2, 3])     # from list

arr = np.array([[1, 2, 3], [4, 5, 6]])

arr = np.array((1, 3, 2))    #  from tuple

arr = np.arange(10)   #   using range

arr = np.arange(1,10,2)

arr = np.linspace(1, 19, 10)  # using linspace   # 19 included, 10 evenly spaced values

arr = np.zeros((3,2))

arr = np.random.rand(2,3)   # using random

np.array([1, 2, 3, 4], dtype='f'),    np.array([1, 2, 3, 4], ndmin=5)   


access

arr[1,2]

arr[:2, ::2]      #    first 2 rows and alternate columns(0 and 2)


ndarray operations

arr = np.array([[6,7,8,5],[4,6,7,1]])

print(arr.dtype) # int64 on most platforms (int32 on Windows with NumPy < 2.0)

print(arr.ndim, arr.shape, arr.size)  # 2, (2,4), 8

print(arr+arr, arr-arr, arr*arr, arr>arr) # Element-wise operations

print(arr.T)  #   transpose

print(arr.sum(axis=0)) #   0 for column, 1 for row

print(arr.max(axis = 1)) # max in each row

print(arr.dot(arr.T))  # matrix multiplication

print(arr.reshape(2,2,2))  # to 3d

print(arr.flatten())   # to 1d

print(arr.astype('f'))

print(arr.copy())  # copy() returns an independent copy  # view() shares the same data, so changes made through a view also affect the original

print(arr[arr>6])    #  filter/subset


numpy operations

print(np.empty((3,2)), np.zeros((3,2)), np.zeros_like(arr), np.ones((3,2)),np.identity(4), np.floor(arr))  # np.empty needs a shape; its contents are uninitialized

print(np.dot(arr,arr.T))

print(np.add(arr,arr))

print(np.subtract(arr,arr))

print(np.multiply(arr,arr))

print(np.log(arr))

print(np.square(arr))

print(np.sqrt(arr))

print(np.sin(arr))

print(np.around(arr, 2))

for x in np.nditer(arr):   # to iterate through an n dimensional array

  print(x)

print(np.amin(arr))  # minimum

print(np.vstack((arr, arr)))  # added below  # 2 brackets

print(np.concatenate((arr,arr)))  # same as vstack

print(np.hsplit(arr, 2))  # split column-wise (along axis 1) into 2 equal parts

print(np.where(arr == 4))   # search index

print(np.sort(arr, axis = 0))

print(np.mean(arr,0), np.median(arr,0), np.percentile(arr,10))  # statistical functions

print(np.std(arr,axis=0)) # standard deviation 

print(np.var(arr)) # variance



2. Pandas

This module is commonly used for dataset management. It comes preinstalled with the Anaconda distribution. It has two data structures - Series (1D) and DataFrame (2D).

import pandas as pd


Create 

Series

lst = ['A', 'B', 'C', 'D', 'E']

ser = pd.Series(lst)            #  from list

ser = pd.Series(lst, index = ['i', 'j', 'k', 'l', 'm'])

dic = {1:'A', 2:'B', 3:'C', 4:'D', 5:'E'}      

ser = pd.Series(dic)     #  from dictionary (key becomes row index)

ser = pd.Series(np.array(['P','a','n','d','a','s']))    # from ndarray

Dataframe

A DataFrame has both row index and column index.

df = pd.DataFrame(lst, columns=['Alphabets'])    #   from list with column index

dic = {"calories": [420, 380, 390],

  "duration": [50, 40, 45] }

df = pd.DataFrame(dic, index = ["day1", "day2", "day3"])   #   from dictionary  #  value is list;   key is column index;     index is row index

df_tuple = [(1990, 'Italy','Germany', 115), (1994, 'USA', 'Brazil', 141)]

data = pd.DataFrame(df_tuple, columns=['Year','Country', 'Winner', 'Goals'])   #  from tuple  # 1 tuple element is one row.

df = pd.DataFrame({'Col1': pd.Series(lst1),'Col2': pd.Series(lst2)})   #  from Series

df = pd.read_csv('datasets/ted_data.csv')     #   from csv


Access

ser[0], ser['i':'k']     # using row location or row index

df[0:5], df['Country'], df[['Country', 'Year']]    # using row location or column index

df.loc[:, 'Country'],  df.loc['i':'j', 'Country':'Goals'] ,  df.loc[0:2, 'Country':'Goals']  # using row-column indices or row location-column index

df.iloc[:, 3], df.iloc[1,[2,3]]    #  using row-column locations.

df.at[0,'A'] is the same as df.loc[0,'A'], but is meant for fast access to a single value.

Pandas Operations

pd.read_csv("file.csv")

pd.concat([df1, df2]),   pd.merge(df1,df2,on='city', how='left')

pd.get_dummies(pd.Series(list('abcaa')))     # creates a T/F table with columns a,b and c.

pd.isna(df)    #  creates a T/F  table for each cell

pd.to_datetime(df["Time"])

Series/Dataframe Operations

ser.value_counts(), ser.str

df.at[0,'A'] = 103    # setting

df['A'] = ser

df['class'].replace(['First','Second','Third'], value = [1,2,3], inplace=True)

df.dtypes, df.index, df.columns, df.T, len(df), df.to_csv("file.csv")

df.head(), df.tail(3), df.info(), df.describe(), df.mean(), df.mode(), df.median()

df.to_numpy(), df.copy(), df.dropna(how="all"), df.fillna(value=5)

df.sort_values(by="B", ascending=False), df[df['E'] == 'e']

df.loc[df['usertype'] == 'Subscriber']          # where clause

df.groupby("A")[["C", "D"]].sum()         # group by and aggregate

df.drop('col_3', axis=1, inplace=True)     # drop column

df.apply(lambda x: x.max() - x.min())

df.plot(), df['A'].unique(), df.nunique(), df.values.tolist(), df.isnull().sum()    # unique() is a Series method (use it on one column); nunique() counts distinct values in each column

for i,j in df.iterrows():      #   i is index, j is Series

    print(j['genre'])       #  j.index gives column name


3. Matplotlib

This is a graph plotting library.

import matplotlib.pyplot as plt

plt.plot([1,2,3,4,5],[1,4,9,16,25], color='r', linestyle='--', marker='>')  #  line graph

plt.title("Simple Plot")

plt.ylabel("y-axis")

plt.xlabel("x-axis")

plt.legend()

plt.show() 

 

ypoints = np.array([3, 8, 1, 10])

plt.bar(['Blue', 'Grey', 'Yellow', 'White'], ypoints)    # bar graph   # barh for horizontal

# used when x values are categorical (supports numerical also)


plt.subplot(1, 2, 1)  # 1 row, 2 columns, 1st subplot

plt.scatter(x,y)   #    scatter/dot graph

plt.subplot(1, 2, 2)

plt.pie(y1)     # pie chart, used when there is no x


blood_sugar = [113,85,100,90,150,143,110,86,82]

plt.hist(blood_sugar,bins=[77,100,125,150])     # histogram,  used when x values are in ranges; y shows count


4. Seaborn

This library is built on top of Matplotlib. It works with Pandas data structures.

import seaborn as sns

import matplotlib.pyplot as plt

df = sns.load_dataset('iris')

sns.lineplot(x="species", y="petal_length", data=df)

plt.title('Title using Matplotlib Function')  

plt.show()

sns.barplot(x="species", y="petal_length", data = df)

sns.scatterplot(x='species', y='petal_length', data=df)

sns.histplot(x='species', y='petal_length', data=df)

sns.catplot(x='species', y='petal_length', data=df)

sns.boxplot(x='species', y='petal_length', data=df)

sns.relplot(x='species', y='petal_length', data=df)

sns.displot(x='petal_length', data=df, kde=True)

sns.countplot(x='petal_length', data=df)

sns.heatmap(df.corr(numeric_only=True))  # numeric_only=True skips the non-numeric 'species' column

sns.pairplot(df)  # scatter + count


5. Scipy

This library is used for scientific computations. It is built on top of NumPy.

from scipy.stats import binom

from scipy.stats import norm

from matplotlib import pyplot as plt

import numpy as np

# binomial distribution. 2 values eg: H, T

n,p = 2,.5  

x = np.array(range(0, n+1))

prob = np.array([binom.pmf(k, n, p) for k in x]) 

plt.xlabel('x')

plt.ylabel('Probability')

plt.bar(x, prob)

plt.show()

# normal distribution

n=50

x1 = np.array(range(0, n+2))

prob1 = np.array(norm.pdf(x1,np.mean(x1),np.std(x1))) 

plt.plot(x1, prob1)

plt.show()

x2 = np.array(range(0, n+2))

prob2 = np.array(norm.cdf(x2,np.mean(x2),np.std(x2))) # cumulative

plt.plot(x2, prob2)

plt.show()


6. statistics

statistics.mean(), statistics.median(), statistics.median_grouped(), statistics.mode(), statistics.pstdev(), statistics.stdev(), statistics.pvariance(), statistics.variance()