🐼 Data Science Python
Pandas & NumPy Complete Cheatsheet
Arrays, DataFrames, data cleaning, merging, groupby and visualization — complete data science toolkit.
📖 10 sections
⏱ 24 min read
✅ Quizzes included
🌙 Dark mode
01 NumPy Arrays
PYNumPy arrays
import numpy as np

# Create arrays
a = np.array([1, 2, 3])              # 1D from list
b = np.array([[1,2],[3,4]])          # 2D
c = np.zeros((3, 4))                 # 3x4 zeros
d = np.ones((2, 3), dtype=float)     # 2x3 ones
e = np.arange(0, 10, 2)             # [0,2,4,6,8]
f = np.linspace(0, 1, 5)            # [0, .25, .5, .75, 1]
g = np.random.rand(3, 3)            # 3x3 random 0-1
h = np.random.randn(100)            # 100 from normal dist
i = np.eye(3)                        # 3x3 identity matrix

# Shape info
a.shape    # (3,)
b.shape    # (2, 2)
b.ndim     # 2
b.size     # 4
b.dtype    # dtype('int64')

# Reshape
a.reshape(1, 3)              # (1, 3)
a.flatten()                  # always 1D copy
a.ravel()                    # 1D view (no copy)
💡
NumPy arrays are MUCH faster than Python lists for numerical operations. Operations are vectorized — no slow Python for-loops.
02 Array Operations
PYArray math and indexing
import numpy as np
a = np.array([1, 2, 3, 4, 5])
b = np.array([10, 20, 30, 40, 50])

# Element-wise math
a + b         # [11, 22, 33, 44, 55]
a * 2         # [2, 4, 6, 8, 10]
a ** 2        # [1, 4, 9, 16, 25]
np.sqrt(a)    # square root
np.log(a)     # natural log
np.exp(a)     # e^x
np.dot(a, b)  # dot product = 550

# Statistics
np.mean(a)    # 3.0
np.median(a)  # 3.0
np.std(a)     # 1.41
np.sum(a)     # 15
np.min(a), np.max(a)   # 1, 5
np.argmin(a)           # index of min = 0
np.cumsum(a)           # [1, 3, 6, 10, 15]

# Indexing and slicing
m = np.array([[1,2,3],[4,5,6],[7,8,9]])
m[1, 2]      # 6 (row 1, col 2)
m[0:2, 1:]   # rows 0-1, cols 1+
m[:, 1]      # all rows, col 1 = [2,5,8]
m[m > 4]     # boolean indexing = [5,6,7,8,9]
03 Pandas Series
PYPandas Series
import pandas as pd

# Create Series
s = pd.Series([10, 20, 30, 40], index=['a','b','c','d'])
s2 = pd.Series({'x': 1, 'y': 2, 'z': 3})

# Access
s['a']         # 10
s[['a','c']]   # select multiple
s[s > 15]      # boolean filter = [20, 30, 40]

# Methods
s.mean()       # 25.0
s.sum()        # 100
s.describe()   # count, mean, std, min, quartiles, max
s.value_counts()  # frequency count
s.sort_values()   # sort ascending
s.isnull()        # boolean mask of NaN values
s.fillna(0)       # replace NaN with 0
s.dropna()        # remove NaN rows

# Apply function
s.apply(lambda x: x * 2)
s.map({'a': 'Alpha', 'b': 'Beta'})  # map values
04 DataFrame Basics
PYDataFrame basics
import pandas as pd

df = pd.read_csv('data.csv')
df = pd.read_excel('data.xlsx')
df = pd.DataFrame({'name': ['Ali','Sara'], 'age': [22, 25]})

# Explore
df.head(5)          # first 5 rows
df.tail(5)          # last 5 rows
df.info()           # dtypes + null counts
df.describe()       # statistics for numeric cols
df.shape            # (rows, cols)
df.columns          # column names
df.dtypes           # data types
df.index            # row labels

# Select
df['name']               # single column (Series)
df[['name', 'age']]      # multiple columns (DataFrame)
df.iloc[0]               # first row by position
df.iloc[0:5, 1:3]        # rows 0-4, cols 1-2 by position
df.loc[0, 'name']        # by label
df.loc[0:5, ['name']]    # by label range

# Modify
df['salary'] = 50000     # add column
df['age'] = df['age'] + 1  # modify column
del df['column']         # delete column
df.drop('col', axis=1)   # drop column (non-destructive)
💡
iloc = integer location (position-based). loc = label-based. Use iloc for positional slicing, loc for label-based filtering.
05 Data Cleaning
PYData cleaning
import pandas as pd

# Missing data
df.isnull().sum()          # count nulls per column
df.isnull().any()          # any nulls per column
df.dropna()                # drop rows with any null
df.dropna(subset=['age'])  # drop if 'age' is null
df.dropna(how='all')       # drop only if ALL are null
df.fillna(0)               # fill all nulls with 0
df.fillna({'age': df['age'].mean(), 'name': 'Unknown'})
df['age'].fillna(df['age'].median(), inplace=True)
df.interpolate()           # interpolate numeric nulls

# Duplicates
df.duplicated().sum()      # count duplicate rows
df.drop_duplicates()       # remove duplicates
df.drop_duplicates(subset=['email'])  # on specific cols

# Data types
df['age'] = df['age'].astype(int)
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
df['price'] = df['price'].str.replace('$','').astype(float)

# String cleaning
df['name'] = df['name'].str.strip().str.lower()
df['phone'] = df['phone'].str.replace(r'[^0-9]', '', regex=True)
⚠️
inplace=True modifies the DataFrame directly. Without it, you must reassign: df = df.dropna()
06 Filtering & Querying
PYFiltering and querying
import pandas as pd

# Boolean filtering
df[df['age'] > 25]
df[(df['age'] > 18) & (df['country'] == 'PK')]
df[(df['status'] == 'active') | (df['role'] == 'admin')]
df[~df['email'].str.contains('@gmail.com')]  # NOT

# isin
df[df['country'].isin(['PK', 'IN', 'NG'])]
df[~df['role'].isin(['admin', 'superuser'])]

# String methods
df[df['name'].str.startswith('A')]
df[df['email'].str.contains('@gmail')]
df[df['name'].str.len() > 5]

# query() method (SQL-like)
df.query('age > 18 and country == "PK"')
df.query('age > @threshold')  # use Python variable with @

# Between
df[df['age'].between(18, 30)]

# Top/bottom N
df.nlargest(5, 'salary')   # top 5 by salary
df.nsmallest(3, 'age')     # bottom 3 by age
07 GroupBy & Aggregation
PYGroupBy and aggregation
import pandas as pd

# Basic groupby
df.groupby('country')['salary'].mean()
df.groupby('country')['salary'].agg(['mean', 'max', 'count'])

# Multiple groupby keys
df.groupby(['country', 'role'])['salary'].mean()

# agg with dict
df.groupby('country').agg({
    'salary': ['mean', 'max', 'min'],
    'age': 'mean',
    'user_id': 'count'
})

# Named aggregations (Pandas 0.25+)
df.groupby('country').agg(
    avg_salary=('salary', 'mean'),
    max_salary=('salary', 'max'),
    user_count=('user_id', 'count')
)

# Transform (keep original shape)
df['rank_in_country'] = df.groupby('country')['salary'].rank(ascending=False)
df['country_avg'] = df.groupby('country')['salary'].transform('mean')

# Pivot table
pd.pivot_table(df, values='salary', index='country',
               columns='role', aggfunc='mean', fill_value=0)
08 Merging & Joining
PYMerging and reshaping
import pandas as pd

# Merge (like SQL JOIN)
pd.merge(df1, df2, on='user_id')             # inner join
pd.merge(df1, df2, on='user_id', how='left') # left join
pd.merge(df1, df2, left_on='id', right_on='user_id')

# Concat (stack vertically or horizontally)
pd.concat([df1, df2], ignore_index=True)     # vertical stack
pd.concat([df1, df2], axis=1)                # horizontal

# Stack / unstack (multi-index)
df.set_index(['country', 'year']).unstack(level=1)

# Melt (wide to long)
pd.melt(df, id_vars=['name'], value_vars=['q1','q2','q3'],
        var_name='quarter', value_name='revenue')

# Pivot (long to wide)
df.pivot(index='name', columns='quarter', values='revenue')

# Sample and shuffle
df.sample(100)                # random 100 rows
df.sample(frac=0.2)           # random 20%
df.sample(frac=1).reset_index(drop=True)  # shuffle
09 Visualization
PYQuick visualization
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Pandas built-in
df['age'].hist(bins=20)
df['country'].value_counts().plot(kind='bar')
df.plot(x='date', y='revenue', kind='line')
df.plot(kind='scatter', x='age', y='salary')
df.boxplot(column='salary', by='country')

# Matplotlib
fig, ax = plt.subplots(1, 2, figsize=(12, 5))
ax[0].plot(x, y, color='blue', linewidth=2, label='Revenue')
ax[0].set_title('Revenue Over Time')
ax[0].legend()
ax[1].scatter(df['age'], df['salary'], alpha=0.5)
plt.tight_layout()
plt.savefig('output.png', dpi=300)
plt.show()

# Seaborn (higher-level)
sns.histplot(df['age'], kde=True)
sns.boxplot(x='country', y='salary', data=df)
sns.heatmap(df.corr(), annot=True, cmap='coolwarm')
sns.pairplot(df[['age', 'salary', 'score']])
10 Mini Quizzes
❓ Quiz 1
What is the difference between iloc and loc in Pandas?
df.iloc[0:5] selects by integer position (like Python slicing). df.loc['a':'c'] selects by label/index. With numeric index they overlap, but with string index they differ.
❓ Quiz 2
What does groupby().transform() do vs groupby().agg()?
agg reduces to one row per group (like GROUP BY in SQL). transform keeps the original shape — useful for adding group statistics back to the original DataFrame.