Skip to content

Latest commit

 

History

History
381 lines (286 loc) · 13.7 KB

Addressing Bias and Improving Model Performance.md

File metadata and controls

381 lines (286 loc) · 13.7 KB

Addressing Bias and Improving Model Performance

Slide 1: Understanding Class Imbalance

Class imbalance occurs when the distribution of classes in a dataset is significantly skewed. This fundamental challenge in machine learning can severely impact model performance, particularly in critical applications like fraud detection or medical diagnosis where minority classes are often the most important.

# Build a synthetic binary dataset with a 9:1 class skew to illustrate imbalance.
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification

# weights=[0.9, 0.1] makes class 0 the ~90% majority and class 1 the ~10% minority
X, y = make_classification(
    n_samples=1000,
    n_classes=2,
    weights=[0.9, 0.1],
    n_features=3,
    random_state=42,
)

# Summarise how many samples landed in each class
labels, label_counts = np.unique(y, return_counts=True)
print("Class distribution:", dict(zip(labels, label_counts)))
print("Imbalance ratio:", max(label_counts) / min(label_counts))

# Output:
# Class distribution: {0: 900, 1: 100}
# Imbalance ratio: 9.0

Slide 2: Random Oversampling Implementation

Random oversampling duplicates existing minority class samples to achieve balance. While simple, this technique requires careful implementation to avoid overfitting and ensure proper cross-validation splitting to prevent data leakage.

# Balance the training split by replicating minority-class rows at random.
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Hold out 20% for evaluation BEFORE any resampling, so the test
# distribution stays realistic and no duplicated rows leak across the split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Oversample only the training portion
oversampler = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = oversampler.fit_resample(X_train, y_train)

print("Original training set shape:", np.bincount(y_train))
print("Resampled training set shape:", np.bincount(y_train_ros))

# Output:
# Original training set shape: [720  80]
# Resampled training set shape: [720 720]

Slide 3: SMOTE (Synthetic Minority Over-sampling Technique)

SMOTE creates synthetic samples for the minority class by interpolating between existing instances. This advanced oversampling technique generates new samples along the line segments joining nearest neighbors of minority class instances.

# Create synthetic minority samples with SMOTE and visualise the result.
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Side-by-side scatter of the first two features: original vs. SMOTE-balanced
fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(12, 4))

for class_label, legend_name in ((0, 'Majority'), (1, 'Minority')):
    mask = y_train == class_label
    ax_before.scatter(X_train[mask, 0], X_train[mask, 1], label=legend_name)
ax_before.set_title('Original Data')

for class_label, legend_name in ((0, 'Majority'), (1, 'Minority')):
    mask = y_train_smote == class_label
    ax_after.scatter(X_train_smote[mask, 0], X_train_smote[mask, 1],
                     label=legend_name)
ax_after.set_title('SMOTE Applied')

plt.tight_layout()

Slide 4: Random Undersampling Strategy

Random undersampling reduces majority class samples to match minority class frequency. This technique helps balance datasets by randomly removing majority class instances, though it risks losing potentially important information from the discarded samples.

# Shrink the majority class down to the minority-class size by random removal.
from imblearn.under_sampling import RandomUnderSampler

undersampler = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = undersampler.fit_resample(X_train, y_train)

print("Original distribution:", np.bincount(y_train))
print("Undersampled distribution:", np.bincount(y_train_rus))

# Fraction of training rows discarded to reach balance
info_loss = 1 - len(y_train_rus) / len(y_train)
print(f"Information loss: {info_loss:.2%}")

# Output:
# Original distribution: [720  80]
# Undersampled distribution: [80 80]
# Information loss: 80.00%
# NOTE(review): the original slide printed 77.50%, which is inconsistent with
# the [720 80] split shown above (1 - 160/800 = 80.00%).

Slide 5: Tomek Links Undersampling

Tomek links identify pairs of samples from different classes that are nearest neighbors. This method removes majority class samples from Tomek links to create a cleaner decision boundary and reduce class overlap.

# Remove majority-class members of Tomek links (cross-class nearest-neighbour
# pairs) to clean up the class boundary without balancing the classes.
from imblearn.under_sampling import TomekLinks

tomek = TomekLinks()
X_train_tl, y_train_tl = tomek.fit_resample(X_train, y_train)

# Only majority samples are dropped, so the minority count is unchanged
removed = len(y_train) - len(y_train_tl)
print(f"Original samples: {len(y_train)}")
print(f"Samples after Tomek links: {len(y_train_tl)}")
print(f"Removed samples: {removed}")
print("Class distribution after Tomek:", np.bincount(y_train_tl))

# Output:
# Original samples: 800
# Samples after Tomek links: 776
# Removed samples: 24
# Class distribution after Tomek: [696  80]

Slide 6: Model Evaluation for Imbalanced Data

Traditional accuracy metrics can be misleading for imbalanced datasets. Proper evaluation requires metrics like precision, recall, F1-score, and ROC-AUC that account for class imbalance and provide more meaningful performance assessment.

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Train and evaluate model with different sampling techniques
def evaluate_model(X_train_sampled, y_train_sampled, X_test, y_test):
    """Fit a RandomForest on (possibly resampled) training data and print
    imbalance-aware metrics on the untouched test set.

    Prints a per-class precision/recall/F1 report plus the ROC-AUC score.
    Returns None; output goes to stdout only.
    """
    clf = RandomForestClassifier(random_state=42)
    clf.fit(X_train_sampled, y_train_sampled)
    y_pred = clf.predict(X_test)
    # FIX: ROC-AUC must be computed from class-1 probability scores, not the
    # hard 0/1 predictions -- scoring predict() output collapses the ranking
    # information AUC measures and systematically understates the score.
    y_score = clf.predict_proba(X_test)[:, 1]

    # Generate comprehensive metrics
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print(f"ROC-AUC Score: {roc_auc_score(y_test, y_score):.3f}")

# Evaluate original imbalanced data
evaluate_model(X_train, y_train, X_test, y_test)

# Evaluate SMOTE-balanced data
evaluate_model(X_train_smote, y_train_smote, X_test, y_test)

# Output Example (exact numbers vary; probability-based AUC will differ from
# the label-based figures the original slide showed):
# Original Data:
# Classification Report:
#               precision    recall  f1-score   support
#            0       0.90      0.98      0.94       180
#            1       0.75      0.35      0.48        20
# ROC-AUC Score: 0.665
#
# SMOTE-balanced Data:
#               precision    recall  f1-score   support
#            0       0.95      0.96      0.95       180
#            1       0.71      0.65      0.68        20
# ROC-AUC Score: 0.805

Slide 7: Implementing Combined Sampling Techniques

Combined sampling techniques leverage both oversampling and undersampling to achieve optimal balance. This approach helps maintain important majority class information while ensuring minority class representation.

from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# SMOTE oversampling followed by Tomek-link cleaning in a single sampler:
# keeps majority-class information while ensuring minority representation.
combined_sampler = SMOTETomek(random_state=42)
X_train_combined, y_train_combined = combined_sampler.fit_resample(X_train, y_train)

# Compute each class distribution once instead of re-running np.bincount
# four times (original recomputed it per print and per ratio term).
original_counts = np.bincount(y_train)
combined_counts = np.bincount(y_train_combined)

print("Original distribution:", original_counts)
print("Combined sampling distribution:", combined_counts)

# Majority/minority ratio before vs. after combined resampling
original_ratio = original_counts[0] / original_counts[1]
new_ratio = combined_counts[0] / combined_counts[1]
print(f"Original imbalance ratio: {original_ratio:.2f}")
print(f"New imbalance ratio: {new_ratio:.2f}")

Slide 8: Cost-Sensitive Learning Implementation

Cost-sensitive learning assigns different misclassification costs to different classes, effectively penalizing errors on minority classes more heavily during model training.

from sklearn.ensemble import RandomForestClassifier

# Inverse-frequency weights: n_samples / (n_classes * count_per_class).
# This is the same formula sklearn applies for class_weight='balanced'.
classes = np.unique(y_train)
per_class_weight = len(y_train) / (len(classes) * np.bincount(y_train))
class_weights = dict(zip(classes, per_class_weight))

# Penalise minority-class errors more heavily during training
cost_sensitive_clf = RandomForestClassifier(
    class_weight=class_weights,
    random_state=42
)

cost_sensitive_clf.fit(X_train, y_train)
y_pred_weighted = cost_sensitive_clf.predict(X_test)

print("Class weights:", class_weights)
print("\nPrediction distribution:", np.bincount(y_pred_weighted))

Slide 9: Real-World Application - Credit Card Fraud Detection

Implementing imbalanced learning techniques for credit card fraud detection, where fraudulent transactions typically represent less than 1% of all transactions.

import pandas as pd
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score

# Simulate credit card transaction dataset (~1% positive/fraud labels)
np.random.seed(42)
n_samples = 10000
fraud_ratio = 0.01

# Generate transaction features
# NOTE(review): y is drawn independently of X, so there is no learnable
# signal in this simulation -- expect ROC-AUC near 0.5. Fine for
# demonstrating the pipeline mechanics, but swap in real (or correlated)
# data for meaningful scores.
X = np.random.randn(n_samples, 4)  # Amount, time, V1, V2
y = np.random.choice([0, 1], size=n_samples, p=[1-fraud_ratio, fraud_ratio])

# Create preprocessing and modeling pipeline.
# Using imblearn's Pipeline (not sklearn's) matters: it applies SMOTE only
# during fit, so each CV training fold is resampled while the validation
# fold is left untouched -- no synthetic-sample leakage into evaluation.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('sampler', SMOTE(random_state=42)),
    ('classifier', RandomForestClassifier(class_weight='balanced'))
])

# Evaluate using 5-fold cross-validation on a rank-based metric
scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
print(f"ROC-AUC scores: {scores.mean():.3f} (+/- {scores.std() * 2:.3f})")

Slide 10: Custom Sampling Strategy Implementation

Developing a custom sampling strategy that combines multiple techniques and allows for fine-tuned control over the resampling process based on domain knowledge.

from imblearn.base import BaseSampler
from collections import Counter

class CustomSampler(BaseSampler):
    """Undersample the majority class (label 0) to a fixed 1:2
    minority:majority ratio, keeping every minority (label 1) sample.

    Assumes binary labels {0, 1} with 1 as the minority class.
    """

    # FIX: imblearn's BaseSampler machinery requires a _sampling_type class
    # attribute ('over-sampling' / 'under-sampling' / 'clean-sampling');
    # without it fit_resample fails validation before reaching _fit_resample.
    _sampling_type = 'under-sampling'

    def __init__(self, sampling_strategy='auto', random_state=None):
        super().__init__(sampling_strategy=sampling_strategy)
        self.random_state = random_state

    def _fit_resample(self, X, y):
        # Get class distribution
        counter = Counter(y)

        # Desired post-sampling ratio: 1 minority for every 2 majority rows
        target_ratio = 0.5
        n_minority = counter[1]
        # FIX: clamp to the available majority count -- the unclamped value
        # makes np.random.choice(..., replace=False) raise whenever
        # 2 * n_minority exceeds the majority-class size.
        n_majority = min(int(n_minority / target_ratio), counter[0])

        majority_indices = np.where(y == 0)[0]
        minority_indices = np.where(y == 1)[0]

        # FIX: use a local RNG instead of np.random.seed, which would
        # clobber the global NumPy random state for the rest of the program.
        rng = np.random.RandomState(self.random_state)
        selected_majority = rng.choice(
            majority_indices,
            size=n_majority,
            replace=False
        )

        # All minority rows plus the sampled majority subset
        selected_indices = np.concatenate([selected_majority, minority_indices])

        return X[selected_indices], y[selected_indices]

# Usage example
custom_sampler = CustomSampler(random_state=42)
X_resampled, y_resampled = custom_sampler.fit_resample(X, y)
print("Original distribution:", Counter(y))
print("Resampled distribution:", Counter(y_resampled))

Slide 11: Performance Monitoring and Validation

Implementing a comprehensive validation strategy for imbalanced learning, including cross-validation with stratification and performance monitoring across different sampling techniques.

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_recall_curve
import numpy as np

def validate_sampling_strategy(X, y, sampler, classifier, n_splits=5):
    """Stratified K-fold validation for a resampling strategy.

    For each fold: resample ONLY the training portion, fit the classifier,
    and compute the precision-recall curve on the untouched validation fold.
    Returns a list of {'precision': ..., 'recall': ...} dicts, one per fold.
    """
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_curves = []

    for train_idx, val_idx in folds.split(X, y):
        X_tr, y_tr = X[train_idx], y[train_idx]
        X_val, y_val = X[val_idx], y[val_idx]

        # Resampling happens inside the fold, never on validation data
        X_bal, y_bal = sampler.fit_resample(X_tr, y_tr)

        classifier.fit(X_bal, y_bal)
        val_proba = classifier.predict_proba(X_val)[:, 1]

        precision, recall, _ = precision_recall_curve(y_val, val_proba)
        fold_curves.append({'precision': precision, 'recall': recall})

    return fold_curves

# Example usage
from imblearn.over_sampling import ADASYN
sampler = ADASYN(random_state=42)
clf = RandomForestClassifier(random_state=42)
validation_scores = validate_sampling_strategy(X, y, sampler, clf)

Slide 12: Ensemble Methods for Imbalanced Learning

Implementing ensemble methods specifically designed for imbalanced datasets, combining multiple sampling techniques with different base classifiers to improve overall performance.

# FIX: `clone` was used in fit() without being imported -- NameError at fit time.
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.ensemble import VotingClassifier

class ImbalancedEnsemble(BaseEstimator, ClassifierMixin):
    """Ensemble for imbalanced data: trains one copy of ``base_classifier``
    per (round, sampler) combination, each on differently-resampled data,
    and averages the members' predicted probabilities.

    Parameters
    ----------
    samplers : list of imblearn samplers (must implement fit_resample)
    base_classifier : estimator with fit/predict_proba
    n_estimators : rounds of training; total members = n_estimators * len(samplers)
    """

    def __init__(self, samplers, base_classifier, n_estimators=5):
        self.samplers = samplers
        self.base_classifier = base_classifier
        self.n_estimators = n_estimators
        self.classifiers = []

    def fit(self, X, y):
        """Fit n_estimators * len(samplers) member classifiers; returns self."""
        self.classifiers = []

        # Train multiple classifiers with different sampling strategies
        for _ in range(self.n_estimators):
            for sampler in self.samplers:
                X_resampled, y_resampled = sampler.fit_resample(X, y)
                # clone() yields a fresh unfitted copy so members don't
                # share fitted state with each other or the template
                clf = clone(self.base_classifier)
                clf.fit(X_resampled, y_resampled)
                self.classifiers.append(clf)
        return self

    def predict_proba(self, X):
        """Average the member classifiers' probability estimates."""
        probas = np.array([clf.predict_proba(X) for clf in self.classifiers])
        return np.mean(probas, axis=0)

    def predict(self, X):
        """Hard labels from the averaged probabilities.

        New convenience method (ClassifierMixin.score requires predict).
        Assumes class labels are 0..n_classes-1, as in the slides' data.
        """
        return np.argmax(self.predict_proba(X), axis=1)

# Example usage
samplers = [
    SMOTE(random_state=42),
    ADASYN(random_state=42),
    RandomOverSampler(random_state=42)
]
base_clf = RandomForestClassifier(random_state=42)
ensemble = ImbalancedEnsemble(samplers, base_clf)

Slide 13: Additional Resources