🛠️ 1. Setup & Environment
# Install packages (covers every library imported in the sections below)
pip install pandas numpy scipy matplotlib seaborn plotly scikit-learn jupyter requests beautifulsoup4 tqdm
# Start Jupyter
jupyter notebook
📊 2. Data Handling with Pandas
import pandas as pd
# Read data
# Load a CSV file into a DataFrame.
df = pd.read_csv('data.csv') # Also: pd.read_excel, pd.read_json, pd.read_html, pd.read_sql
# Peek at the first and last rows.
df.head(), df.tail()
# Dimensions, column summary, and descriptive statistics.
df.shape, df.info(), df.describe()
# Selecting columns/rows
# A single column name yields one column; a list of names yields a sub-DataFrame.
df['col'], df[['col1', 'col2']]
# iloc selects by integer position, loc selects by index label.
df.iloc[0], df.loc[0]
# Boolean mask: keep only the rows where 'col' exceeds 100.
df[df['col'] > 100]
# Data cleaning
# These three calls return new objects; assign the result to keep it.
df.dropna(), df.fillna(0), df.drop_duplicates()
# Assign the result back to the column: calling replace(..., inplace=True)
# on the df['col'] selection is chained assignment and does not reliably
# update df itself.
df['col'] = df['col'].replace({'N/A': None})
# Normalize the header names (lower-case, surrounding whitespace removed).
df.columns = [c.lower().strip() for c in df.columns]
# Type conversion
# Parse the 'date' column into datetimes so the .dt accessor below works.
df['date'] = pd.to_datetime(df['date'])
# NOTE(review): astype(int) presumably fails if the column still holds
# missing values — confirm the column is clean before converting.
df['col'] = df['col'].astype(int)
# Feature engineering
# Element-wise ratio of two existing columns.
df['new_col'] = df['col1'] / df['col2']
# Extract the calendar year from the parsed dates.
df['year'] = df['date'].dt.year
# Grouping & Aggregation
# Mean and sum of 'val' within each distinct value of 'col'.
df.groupby('col').agg({'val': ['mean', 'sum']})
# Cross-tabulation: rows from 'A', columns from 'B', cells hold the sum of 'C'.
df.pivot_table(index='A', columns='B', values='C', aggfunc='sum')
🔢 3. Numerical Computing with NumPy
import numpy as np
# Two small 1-D arrays; the second is the other operand of the dot product.
arr = np.array([1, 2, 3])
arr2 = np.array([4, 5, 6])
# Inspect dimensions and element type.
arr.shape, arr.dtype
# Column-vector view (3 rows, 1 column) and a flattened 1-D copy.
arr.reshape(3, 1), arr.flatten()
# Operations
arr.mean(), arr.std(), np.median(arr)
# Dot product of two equal-length vectors.
np.dot(arr, arr2)
# Element-wise conditional: 1 where the value exceeds 2, otherwise 0.
np.where(arr > 2, 1, 0)
📈 4. Visualization
Matplotlib
import matplotlib.pyplot as plt
# Each call below is an independent example of one chart type.
# Line plot of 'y' against 'x'.
plt.plot(df['x'], df['y'])
# Bar chart: one bar per entry of 'cat', height taken from 'val'.
plt.bar(df['cat'], df['val'])
# Histogram of 'val' split into 20 bins.
plt.hist(df['val'], bins=20)
# Scatter plot of 'y' against 'x'.
plt.scatter(df['x'], df['y'])
# Title and axis labels.
plt.title("Title"), plt.xlabel("X"), plt.ylabel("Y")
# NOTE(review): none of the plotting calls above pass label=..., so legend()
# presumably has no entries to list — confirm and add labels if needed.
plt.legend(), plt.grid(), plt.show()
Seaborn
import seaborn as sns
# Distribution of 'value' within each 'category'.
sns.boxplot(data=df, x='category', y='value')
# Correlation heatmap. numeric_only=True skips text columns such as
# 'category', which would otherwise make corr() fail on recent pandas.
sns.heatmap(df.corr(numeric_only=True), annot=True, cmap='coolwarm')
# Pairwise scatter plots, colored by the 'target' column.
sns.pairplot(df, hue='target')
Plotly (Interactive)
import plotly.express as px
# Call .show() on each figure: a bare px.* expression is only rendered when it
# is the last line of a notebook cell, and never in a plain script.
px.scatter(df, x='x', y='y', color='label').show()
px.bar(df, x='category', y='value').show()
🧹 5. Data Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder
# Standardize: learn the scaling statistics of the two numeric columns,
# then overwrite those columns with their scaled values.
scaler = StandardScaler()
scaler.fit(df[['num1', 'num2']])
df[['num1', 'num2']] = scaler.transform(df[['num1', 'num2']])
# Encode categorical: map each distinct category to an integer code.
le = LabelEncoder()
le.fit(df['category'])
df['cat_encoded'] = le.transform(df['category'])
# One-hot encoding: swap 'category' for one indicator column per value.
df = pd.get_dummies(
    df,
    columns=['category'],
)
🧠 6. Machine Learning with Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
# Split
# Features are every column except 'target'; 'target' is the label.
X = df.drop('target', axis=1)
y = df['target']
# Hold out 20% for testing. A fixed random_state makes the split, and so
# the scores printed below, reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Train model
# Seed the forest as well so repeated runs build the same trees.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
# Predict & Evaluate
y_pred = model.predict(X_test)
# Print the accuracy like the other metrics instead of discarding it.
print(accuracy_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
🔍 7. Exploratory Data Analysis (EDA)
# Summary statistics for every column, numeric or not.
df.describe(include='all')
# Class balance of the label column.
df['target'].value_counts()
# Pairwise correlations between the numeric columns.
df.corr(numeric_only=True)
# Plot the same numeric-only correlation matrix; without numeric_only=True
# the call disagrees with the line above and fails on text columns.
sns.heatmap(df.corr(numeric_only=True), annot=True)
# Scatter-plot matrix of the columns.
sns.pairplot(df)
🧪 8. Statistical Analysis
from scipy import stats
# t-test: compare the 'score' values of group A against those of group B.
stats.ttest_ind(
    df.loc[df['group'] == 'A', 'score'],
    df.loc[df['group'] == 'B', 'score'],
)
# Chi-squared test of independence on the gender x response contingency table.
from scipy.stats import chi2_contingency
chi2_contingency(
    pd.crosstab(df['gender'], df['response'])
)
📁 9. Working with Files & SQL
# CSV/Excel/JSON
# index=False keeps the row index out of the CSV file.
df.to_csv('cleaned.csv', index=False)
df.to_excel('data.xlsx')
# SQL
import sqlite3
from contextlib import closing
# closing() guarantees the connection is released even if a statement fails;
# using the sqlite3 connection itself as a context manager only manages the
# transaction and leaves the connection open.
with closing(sqlite3.connect('data.db')) as conn:
    # Write the DataFrame, replacing the table if it already exists.
    df.to_sql('table_name', conn, if_exists='replace')
    # Querying from SQL
    # Keep the result in a variable: the connection is closed after this block.
    df_sql = pd.read_sql('SELECT * FROM table_name', conn)
🌐 10. Web Scraping (Bonus)
import requests
from bs4 import BeautifulSoup
url = "https://example.com"
# Always pass a timeout: without one the request can block indefinitely
# on an unresponsive server.
res = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx responses instead of parsing an error page.
res.raise_for_status()
# Parse the HTML with the parser bundled with the standard library.
soup = BeautifulSoup(res.text, 'html.parser')
# Collect every <h2> element on the page.
titles = soup.find_all('h2')
📅 11. Time Series Basics
# Resampling needs a datetime index: parse the column, then make it the index.
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
# Monthly averages. numeric_only=True skips text columns, which otherwise
# make mean() fail on recent pandas.
# NOTE(review): newer pandas releases appear to prefer the 'ME' alias over
# 'M' — confirm against the pandas version in use before switching.
df.resample('M').mean(numeric_only=True)
# 3-row moving average of 'value'; the first two rows have no full window.
df['rolling_mean'] = df['value'].rolling(window=3).mean()
🧰 12. Useful Utilities
# Progress bar
from tqdm import tqdm
# Wrap any iterable in tqdm(); the `...` body is a placeholder for real work.
for i in tqdm(range(1000)): ...
# Profiling
# The next line is a shell command: run it in a terminal, not as Python.
# NOTE(review): the pandas-profiling project appears to have been renamed
# ydata-profiling upstream — confirm, and update the install and import lines
# together if so.
pip install pandas-profiling
from pandas_profiling import ProfileReport
# Build the report for the whole DataFrame and write it out as an HTML file.
profile = ProfileReport(df)
profile.to_file("eda_report.html")
📚 13. Resources
- Pandas Docs: https://pandas.pydata.org/docs/
- Seaborn Gallery: https://seaborn.pydata.org/examples/index.html
- Scikit-learn User Guide: https://scikit-learn.org/stable/user_guide.html