🐼 Data Science Python
Pandas & NumPy Complete Cheatsheet
Arrays, DataFrames, data cleaning, merging, groupby and visualization — complete data science toolkit.
01
NumPy Arrays
▼
PYNumPy arrays
import numpy as np
# Create arrays
a = np.array([1, 2, 3]) # 1D from list
b = np.array([[1,2],[3,4]]) # 2D
c = np.zeros((3, 4)) # 3x4 zeros
d = np.ones((2, 3), dtype=float) # 2x3 ones
e = np.arange(0, 10, 2) # [0,2,4,6,8]
f = np.linspace(0, 1, 5) # [0, .25, .5, .75, 1]
g = np.random.rand(3, 3) # 3x3 random 0-1
h = np.random.randn(100) # 100 from normal dist
i = np.eye(3) # 3x3 identity matrix
# Shape info
a.shape # (3,)
b.shape # (2, 2)
b.ndim # 2
b.size # 4
b.dtype # dtype('int64')
# Reshape
a.reshape(1, 3) # (1, 3)
a.flatten() # always 1D copy
a.ravel() # 1D view (no copy)
💡
NumPy arrays are MUCH faster than Python lists for numerical operations. Operations are vectorized — no slow Python for-loops.
02
Array Operations
▼
PYArray math and indexing
import numpy as np a = np.array([1, 2, 3, 4, 5]) b = np.array([10, 20, 30, 40, 50]) # Element-wise math a + b # [11, 22, 33, 44, 55] a * 2 # [2, 4, 6, 8, 10] a ** 2 # [1, 4, 9, 16, 25] np.sqrt(a) # square root np.log(a) # natural log np.exp(a) # e^x np.dot(a, b) # dot product = 550 # Statistics np.mean(a) # 3.0 np.median(a) # 3.0 np.std(a) # 1.41 np.sum(a) # 15 np.min(a), np.max(a) # 1, 5 np.argmin(a) # index of min = 0 np.cumsum(a) # [1, 3, 6, 10, 15] # Indexing and slicing m = np.array([[1,2,3],[4,5,6],[7,8,9]]) m[1, 2] # 6 (row 1, col 2) m[0:2, 1:] # rows 0-1, cols 1+ m[:, 1] # all rows, col 1 = [2,5,8] m[m > 4] # boolean indexing = [5,6,7,8,9]
03
Pandas Series
▼
PYPandas Series
import pandas as pd
# Create Series
s = pd.Series([10, 20, 30, 40], index=['a','b','c','d'])
s2 = pd.Series({'x': 1, 'y': 2, 'z': 3})
# Access
s['a'] # 10
s[['a','c']] # select multiple
s[s > 15] # boolean filter = [20, 30, 40]
# Methods
s.mean() # 25.0
s.sum() # 100
s.describe() # count, mean, std, min, quartiles, max
s.value_counts() # frequency count
s.sort_values() # sort ascending
s.isnull() # boolean mask of NaN values
s.fillna(0) # replace NaN with 0
s.dropna() # remove NaN rows
# Apply function
s.apply(lambda x: x * 2)
s.map({'a': 'Alpha', 'b': 'Beta'}) # map values
04
DataFrame Basics
▼
PYDataFrame basics
import pandas as pd
df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx')
df = pd.DataFrame({'name': ['Ali','Sara'], 'age': [22, 25]})
# Explore
df.head(5) # first 5 rows
df.tail(5) # last 5 rows
df.info() # dtypes + null counts
df.describe() # statistics for numeric cols
df.shape # (rows, cols)
df.columns # column names
df.dtypes # data types
df.index # row labels
# Select
df['name'] # single column (Series)
df[['name', 'age']] # multiple columns (DataFrame)
df.iloc[0] # first row by position
df.iloc[0:5, 1:3] # rows 0-4, cols 1-2 by position
df.loc[0, 'name'] # by label
df.loc[0:5, ['name']] # by label range
# Modify
df['salary'] = 50000 # add column
df['age'] = df['age'] + 1 # modify column
del df['column'] # delete column
df.drop('col', axis=1) # drop column (non-destructive)
💡
iloc = integer location (position-based). loc = label-based. Use iloc for positional slicing, loc for label-based filtering.
05
Data Cleaning
▼
PYData cleaning
import pandas as pd
# Missing data
df.isnull().sum() # count nulls per column
df.isnull().any() # any nulls per column
df.dropna() # drop rows with any null
df.dropna(subset=['age']) # drop if 'age' is null
df.dropna(how='all') # drop only if ALL are null
df.fillna(0) # fill all nulls with 0
df.fillna({'age': df['age'].mean(), 'name': 'Unknown'})
df['age'].fillna(df['age'].median(), inplace=True)
df.interpolate() # interpolate numeric nulls
# Duplicates
df.duplicated().sum() # count duplicate rows
df.drop_duplicates() # remove duplicates
df.drop_duplicates(subset=['email']) # on specific cols
# Data types
df['age'] = df['age'].astype(int)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['price'] = df['price'].str.replace('$','').astype(float)
# String cleaning
df['name'] = df['name'].str.strip().str.lower()
df['phone'] = df['phone'].str.replace(r'[^0-9]', '', regex=True)
⚠️
inplace=True modifies the DataFrame directly. Without it, you must reassign: df = df.dropna()
06
Filtering & Querying
▼
PYFiltering and querying
import pandas as pd
# Boolean filtering
df[df['age'] > 25]
df[(df['age'] > 18) & (df['country'] == 'PK')]
df[(df['status'] == 'active') | (df['role'] == 'admin')]
df[~df['email'].str.contains('@gmail.com')] # NOT
# isin
df[df['country'].isin(['PK', 'IN', 'NG'])]
df[~df['role'].isin(['admin', 'superuser'])]
# String methods
df[df['name'].str.startswith('A')]
df[df['email'].str.contains('@gmail')]
df[df['name'].str.len() > 5]
# query() method (SQL-like)
df.query('age > 18 and country == "PK"')
df.query('age > @threshold') # use Python variable with @
# Between
df[df['age'].between(18, 30)]
# Top/bottom N
df.nlargest(5, 'salary') # top 5 by salary
df.nsmallest(3, 'age') # bottom 3 by age
07
GroupBy & Aggregation
▼
PYGroupBy and aggregation
import pandas as pd
# Basic groupby
df.groupby('country')['salary'].mean()
df.groupby('country')['salary'].agg(['mean', 'max', 'count'])
# Multiple groupby keys
df.groupby(['country', 'role'])['salary'].mean()
# agg with dict
df.groupby('country').agg({
'salary': ['mean', 'max', 'min'],
'age': 'mean',
'user_id': 'count'
})
# Named aggregations (Pandas 0.25+)
df.groupby('country').agg(
avg_salary=('salary', 'mean'),
max_salary=('salary', 'max'),
user_count=('user_id', 'count')
)
# Transform (keep original shape)
df['rank_in_country'] = df.groupby('country')['salary'].rank(ascending=False)
df['country_avg'] = df.groupby('country')['salary'].transform('mean')
# Pivot table
pd.pivot_table(df, values='salary', index='country',
columns='role', aggfunc='mean', fill_value=0)
08
Merging & Joining
▼
PYMerging and reshaping
import pandas as pd
# Merge (like SQL JOIN)
pd.merge(df1, df2, on='user_id') # inner join
pd.merge(df1, df2, on='user_id', how='left') # left join
pd.merge(df1, df2, left_on='id', right_on='user_id')
# Concat (stack vertically or horizontally)
pd.concat([df1, df2], ignore_index=True) # vertical stack
pd.concat([df1, df2], axis=1) # horizontal
# Stack / unstack (multi-index)
df.set_index(['country', 'year']).unstack(level=1)
# Melt (wide to long)
pd.melt(df, id_vars=['name'], value_vars=['q1','q2','q3'],
var_name='quarter', value_name='revenue')
# Pivot (long to wide)
df.pivot(index='name', columns='quarter', values='revenue')
# Sample and shuffle
df.sample(100) # random 100 rows
df.sample(frac=0.2) # random 20%
df.sample(frac=1).reset_index(drop=True) # shuffle
09
Visualization
▼
PYQuick visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Pandas built-in
df['age'].hist(bins=20)
df['country'].value_counts().plot(kind='bar')
df.plot(x='date', y='revenue', kind='line')
df.plot(kind='scatter', x='age', y='salary')
df.boxplot(column='salary', by='country')
# Matplotlib
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].plot(x, y, color='blue', linewidth=2, label='Revenue')
ax[0].set_title('Revenue Over Time')
ax[0].legend()
ax[1].scatter(df['age'], df['salary'], alpha=0.5)
plt.tight_layout()
plt.savefig('output.png', dpi=300)
plt.show()
# Seaborn (higher-level)
sns.histplot(df['age'], kde=True)
sns.boxplot(x='country', y='salary', data=df)
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
sns.pairplot(df[['age', 'salary', 'score']])
10
Mini Quizzes
▼
❓ Quiz 1
What is the difference between iloc and loc in Pandas?
df.iloc[0:5] selects by integer position (like Python slicing). df.loc['a':'c'] selects by label/index. With numeric index they overlap, but with string index they differ.
❓ Quiz 2
What does groupby().transform() do vs groupby().agg()?
agg reduces to one row per group (like GROUP BY in SQL). transform keeps the original shape — useful for adding group statistics back to the original DataFrame.