1. Numpy
NumPy is a module for fast work with numerical arrays. It comes preinstalled with Anaconda (a Python distribution).
import numpy as np
create ndarray
arr = np.array([1, 2, 3]) # from list
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr = np.array((1, 3, 2)) # from tuple
arr = np.arange(10) # using range
arr = np.arange(1,10,2)
arr = np.linspace(1, 19, 10) # using linspace # 10 evenly spaced values from 1 to 19, endpoint included
arr = np.zeros((3,2))
arr = np.random.rand(2,3) # using random
np.array([1, 2, 3, 4], dtype='f'), np.array([1, 2, 3, 4], ndmin=5)
access
arr[1,2]
arr[:2, ::2] # first 2 rows and alternate columns(0 and 2)
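For example, a quick sketch with a made-up 3x4 array shows what the slices above return:
grid = np.arange(12).reshape(3, 4) # [[0 1 2 3], [4 5 6 7], [8 9 10 11]]
print(grid[1, 2]) # 6 -> row 1, column 2
print(grid[:2, ::2]) # [[0 2], [4 6]] -> first 2 rows, columns 0 and 2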
ndarray operations
arr = np.array([[6,7,8,5],[4,6,7,1]])
print(arr.dtype) # int32
print(arr.ndim, arr.shape, arr.size) # 2, (2,4), 8
print(arr+arr, arr-arr, arr*arr, arr>arr) # Element-wise operations
print(arr.T) # transpose
print(arr.sum(axis=0)) # axis=0 sums each column, axis=1 sums each row
print(arr.max(axis = 1)) # max in each row
print(arr.dot(arr.T)) # matrix multiplication
print(arr.reshape(2,2,2)) # to 3d
print(arr.flatten()) # to 1d
print(arr.astype('f'))
print(arr.copy()) # deep copy # view() gives a shallow copy, so changes through the view affect the original
print(arr[arr>6]) # filter/subset
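For the arr defined above, a quick sketch of what the axis argument and the boolean filter return:
print(arr.sum(axis=0)) # [10 13 15 6] -> one sum per column
print(arr.sum(axis=1)) # [26 18] -> one sum per row
print(arr.max(axis=1)) # [8 7]
print(arr[arr > 6]) # [7 8 7] -> boolean mask keeps only elements greater than 6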
numpy operations
print(np.empty((3,2)), np.zeros((3,2)), np.zeros_like(arr), np.ones((3,2)), np.identity(4), np.floor(arr))
print(np.dot(arr,arr.T))
print(np.add(arr,arr))
print(np.subtract(arr,arr))
print(np.multiply(arr,arr))
print(np.log(arr))
print(np.square(arr))
print(np.sqrt(arr))
print(np.sin(arr))
print(np.around(arr, 2))
for x in np.nditer(arr): # to iterate through an n dimensional array
print(x)
print(np.amin(arr)) # minimum
print(np.vstack((arr, arr))) # stacks the second array below the first # argument is a tuple, hence 2 brackets
print(np.concatenate((arr,arr))) # same as vstack
print(np.hsplit(arr, 2)) # split column-wise into 2 equal arrays (vsplit splits row-wise)
print(np.where(arr == 4)) # search index
print(np.sort(arr, axis = 0))
print(np.mean(arr,0), np.median(arr,0), np.percentile(arr,10)) # statistical functions
print(np.std(arr,axis=0)) # standard deviation
print(np.var(arr)) # variance
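np.where also has a three-argument form that replaces values conditionally; a sketch using the same arr:
print(np.where(arr == 4)) # (array([1]), array([0])) -> row and column indices of the match
print(np.where(arr > 6, 1, 0)) # [[0 1 1 0], [0 0 1 0]] -> 1 where the condition holds, else 0
print(np.sort(arr, axis=0)) # [[4 6 7 1], [6 7 8 5]] -> each column sorted independently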
2. Pandas
This module is commonly used for dataset management. It is preinstalled with the Anaconda distribution. It has two data structures - Series (1-D) and DataFrame (2-D).
import pandas as pd
Create
Series
lst = ['A', 'B', 'C', 'D', 'E']
ser = pd.Series(lst) # from list
ser = pd.Series(lst, index = ['i', 'j', 'k', 'l', 'm'])
dic = {1:'A', 2:'B', 3:'C', 4:'D', 5:'E'}
ser = pd.Series(dic) # from dictionary (key becomes row index)
ser = pd.Series(np.array(['P','a','n','d','a','s'])) # from ndarray
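A short sketch (hypothetical values) of label-based vs position-based lookup on a Series:
marks = pd.Series([10, 20, 30], index=['i', 'j', 'k']) # hypothetical data
print(marks['j'], marks.iloc[1]) # 20 20 -> by label vs by position
print(marks['i':'k']) # label slices include both endpoints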
Dataframe
A DataFrame has both row index and column index.
df = pd.DataFrame(lst, columns=['Alphabets']) # from list with column index
dic = {"calories": [420, 380, 390],
"duration": [50, 40, 45] }
df = pd.DataFrame(dic, index = ["day1", "day2", "day3"]) # from dictionary # value is list; key is column index; index is row index
df_tuple = [(1990, 'Italy','Germany', 115), (1994, 'USA', 'Brazil', 141)]
data = pd.DataFrame(df_tuple, columns=['Year','Country', 'Winner', 'Goals']) # from list of tuples # each tuple is one row
df = pd.DataFrame({'Col1': pd.Series(lst1),'Col2': pd.Series(lst2)}) # from Series (lst1, lst2 are any two lists)
df = pd.read_csv('datasets/ted_data.csv') # from csv
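Printing the dictionary-based DataFrame built above makes the row/column index layout clear (a small sketch):
df2 = pd.DataFrame(dic, index=["day1", "day2", "day3"]) # same dic as above
print(df2)
#       calories  duration
# day1       420        50
# day2       380        40
# day3       390        45
print(df2.shape) # (3, 2) -> 3 rows, 2 columns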
Access
ser[0], ser['i':'k'] # using row location or row index
df[0:5], df['Country'], df[['Country', 'Year']] # using row location or column index
df.loc[:, 'Country'], df.loc['i':'j', 'Country':'Goals'], df.loc[0:2, 'Country':'Goals'] # using row-column indices or row location-column index
df.iloc[:, 3], df.iloc[1,[2,3]] # using row-column locations.
df.at[0,'A'] is the same as df.loc[0,'A'] for a single value
Pandas Operations
pd.read_csv("file.csv")
pd.concat([df1, df2]), pd.merge(df1,df2,on='city', how='left')
pd.get_dummies(pd.Series(list('abcaa'))) # one-hot encodes into indicator columns a, b and c
pd.isna(df) # creates a T/F table for each cell
pd.to_datetime(df["Time"])
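A minimal merge sketch with made-up DataFrames, showing that how='left' keeps every row of the left table:
left = pd.DataFrame({'city': ['NY', 'LA', 'SF'], 'temp': [20, 25, 18]}) # hypothetical data
right = pd.DataFrame({'city': ['NY', 'SF'], 'rain': [5, 10]})
print(pd.merge(left, right, on='city', how='left'))
#   city  temp  rain
# 0   NY    20   5.0
# 1   LA    25   NaN  -> no match in right, so rain is NaN
# 2   SF    18  10.0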
Series/Dataframe Operations
ser.value_counts(), ser.str
df.at[0,'A'] = 103 # setting
df['A'] = ser
df['class'] = df['class'].replace(['First','Second','Third'], [1,2,3])
df.dtypes, df.index, df.columns, df.T, len(df), df.to_csv("file.csv")
df.head(), df.tail(3), df.info(), df.describe(), df.mean(), df.mode(), df.median()
df.to_numpy(), df.copy(), df.dropna(how="all"), df.fillna(value=5)
df.sort_values(by="B", ascending=False), df[df['E']=='e']
df.loc[df['usertype'] == 'Subscriber'] # where clause
df.groupby("A")[["C", "D"]].sum() # group by and aggregate
df.drop('col_3', axis=1, inplace=True) # drop column
df.apply(lambda x: x.max() - x.min())
df.plot(), df['A'].unique(), df.nunique() # unique values of one column, distinct count per column, df.values.tolist(), df.isnull().sum()
for i,j in df.iterrows(): # i is index, j is Series
print(j['genre']) # j.index gives the column names
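A small sketch (made-up data) of the group by and aggregate line above:
sales = pd.DataFrame({'store': ['A', 'A', 'B'], 'units': [3, 5, 2], 'price': [10, 10, 12]}) # hypothetical data
print(sales.groupby('store')[['units', 'price']].sum())
#        units  price
# store
# A          8     20
# B          2     12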
3. Matplotlib
This is a graph plotting library.
import matplotlib.pyplot as plt
plt.plot([1,2,3,4,5],[1,4,9,16,25], color='r', linestyle='--', marker='>', label='squares') # line graph # label is shown by legend()
plt.title("Simple Plot")
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.legend()
plt.show()
ypoints = np.array([3, 8, 1, 10])
plt.bar(['Blue', 'Grey', 'Yellow', 'White'], ypoints) # bar graph # barh for horizontal
# used when x values are categorical (supports numerical also)
plt.subplot(1, 2, 1) # 1 row, 2 columns, 1st plot
plt.scatter(x,y) # scatter/dot graph
plt.subplot(1, 2, 2)
plt.pie(y1) # pie chart, used when there is no x
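A complete, runnable sketch of the two subplots above, with made-up values for x, y and y1:
x = [1, 2, 3, 4] # hypothetical data
y = [2, 4, 1, 5]
y1 = [30, 20, 50]
plt.subplot(1, 2, 1) # left panel: scatter
plt.scatter(x, y)
plt.subplot(1, 2, 2) # right panel: pie
plt.pie(y1)
plt.show()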
blood_sugar = [113,85,100,90,150,143,110,86,82]
plt.hist(blood_sugar,bins=[77,100,125,150]) # histogram, used when x values are in ranges; y shows count
4. Seaborn
This library is built on top of Matplotlib. It uses Pandas data structures.
import seaborn as sns
import matplotlib.pyplot as plt
df = sns.load_dataset('iris')
sns.lineplot(x="species", y="petal_length", data=df)
plt.title('Title using Matplotlib Function')
plt.show()
sns.barplot(x="species", y="petal_length", data = df)
sns.scatterplot(x='species', y='petal_length', data=df)
sns.histplot(x='species', y='petal_length', data=df)
sns.catplot(x='species', y='petal_length', data=df)
sns.boxplot(x='species', y='petal_length', data=df)
sns.relplot(x='species', y='petal_length', data=df)
sns.displot(x='petal_length', data=df, kde=True)
sns.countplot(x='petal_length', data=df)
sns.heatmap(df.corr())
sns.pairplot(df) # scatter + count
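Note: in newer pandas, df.corr() requires numeric columns, and the iris frame has a string 'species' column; a sketch that restricts the heatmap to numeric columns (annot=True labels each cell):
numeric_df = df.select_dtypes('number') # drop the non-numeric 'species' column
sns.heatmap(numeric_df.corr(), annot=True)
plt.show()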
5. Scipy
This library is used for scientific computations. It is built on top of NumPy.
from scipy.stats import binom
from scipy.stats import norm
from matplotlib import pyplot as plt
import numpy as np
# binomial distribution: 2 outcomes per trial, e.g. H or T
n, p = 2, 0.5 # 2 trials, probability of success 0.5
x = np.array(range(0, n+1))
prob = np.array([binom.pmf(k, n, p) for k in x])
plt.xlabel('x')
plt.ylabel('Probability')
plt.bar(x, prob)
plt.show()
# normal distribution
n=50
x1 = np.array(range(0, n+2))
prob1 = np.array(norm.pdf(x1,np.mean(x1),np.std(x1)))
plt.plot(x1, prob1)
plt.show()
x2 = np.array(range(0, n+2))
prob2 = np.array(norm.cdf(x2,np.mean(x2),np.std(x2))) # cumulative
plt.plot(x2, prob2)
plt.show()
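A short follow-up sketch with cumulative probabilities from the same scipy.stats distributions:
print(binom.cdf(1, 2, 0.5)) # P(at most 1 head in 2 fair flips) = 0.75
print(norm.cdf(1) - norm.cdf(-1)) # ~0.68 of a standard normal lies within 1 std of the mean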