Balancing training datasets to prevent model bias
Imbalanced datasets are the silent killers of fine-tuned models. If your training data has 95% support tickets about billing and 5% about technical issues, the fine-tuned model will be biased: excellent at billing questions but poor at technical ones. This article covers detecting imbalance, stratified sampling, class weighting, and synthetic oversampling to ensure fair performance across all classes and use cases.
Understanding Class Imbalance
Class imbalance occurs when training examples are unevenly distributed across classes or categories. In a customer support dataset with 1,000 examples:
- Billing: 500 examples (50%)
- Technical: 300 examples (30%)
- Account: 150 examples (15%)
- Other: 50 examples (5%)
During fine-tuning, the model learns to predict the most common class (billing) with high accuracy but struggles with rare classes (other). This is not a bug; it's optimal loss minimization. The model has learned: "Most tickets are billing-related, so respond as if they are."
The cost depends on your use case:
- Imbalanced classification: With a 95% billing / 5% other split, the model can reach 95% overall accuracy (every billing example correct, every other example wrong) while scoring 0% accuracy on the `other` class.
- Imbalanced instruction-following: The model performs well on common instructions but fails on rare ones.
- Imbalanced domains: A fine-tuned customer support model trained on 90% US data will perform poorly in European English.
Detecting Imbalance
Step 1: Compute class distribution
import json
from collections import Counter
def analyze_class_distribution(filepath, class_field="category"):
    """Print and return the class distribution of a JSONL dataset.

    Args:
        filepath: Path to a JSON Lines file (one JSON object per line).
        class_field: Key whose value identifies an example's class.
            Examples that do not contain this key are ignored.

    Returns:
        A ``collections.Counter`` mapping class label to example count.
        It is empty when no example contains ``class_field``.
    """
    distribution = Counter()
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file);
            # json.loads would raise on them.
            if not line.strip():
                continue
            example = json.loads(line)
            if class_field in example:
                distribution[example[class_field]] += 1

    total = sum(distribution.values())
    print(f"Total examples: {total}")
    if not total:
        # Nothing to rank, and the percentage / ratio math below would
        # divide by zero or index into an empty list.
        print("No examples with a class label found.")
        return distribution

    ranked = distribution.most_common()
    print("\nClass distribution:")
    for class_label, count in ranked:
        pct = 100 * count / total
        print(f" {class_label}: {count} ({pct:.1f}%)")

    # Imbalance ratio = size of the largest class / size of the smallest.
    max_count = ranked[0][1]
    min_count = ranked[-1][1]
    imbalance_ratio = max_count / min_count
    print(f"\nImbalance ratio: {imbalance_ratio:.1f}x")
    print("(1.0 = balanced, 10+ = severe imbalance)")
    return distribution
# Example run: report how examples in dataset.jsonl spread across "category".
analyze_class_distribution(filepath="dataset.jsonl", class_field="category")
Step 2: Visualize distribution
import matplotlib.pyplot as plt
from collections import Counter
import json
def plot_class_distribution(filepath, class_field="category"):
    """Save a bar chart of the class distribution to class_distribution.png.

    Args:
        filepath: Path to a JSON Lines file (one JSON object per line).
        class_field: Key whose value identifies an example's class.
            Examples that do not contain this key are ignored.

    Returns:
        None. When no example contains ``class_field`` nothing is plotted
        and no file is written.
    """
    distribution = Counter()
    with open(filepath, encoding="utf-8") as f:
        for line in f:
            # Blank lines are not valid JSON; skip them instead of crashing.
            if not line.strip():
                continue
            ex = json.loads(line)
            if class_field in ex:
                distribution[ex[class_field]] += 1

    if not distribution:
        # zip(*[]) yields nothing, so unpacking into (labels, counts)
        # below would raise ValueError.
        print("No examples with a class label found; nothing to plot.")
        return

    # Bars are ordered from the most to the least frequent class.
    labels, counts = zip(*distribution.most_common())
    plt.figure(figsize=(10, 6))
    plt.bar(labels, counts)
    plt.xlabel("Class")
    plt.ylabel("Count")
    plt.title("Class Distribution in Training Data")
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.savefig("class_distribution.png")
    print("Plot saved to class_distribution.png")
# Example run: chart the "category" distribution of dataset.jsonl.
plot_class_distribution(filepath="dataset.jsonl", class_field="category")
An imbalance ratio above 10x is severe; 3–10x is moderate; 1–3x is mild.
Balancing Strategy 1: Stratified Sampling
Stratified sampling does not change the class distribution; instead, it preserves that distribution in each of the train/val/test splits. This ensures every split is representative of the full dataset.
import json
from sklearn.model_selection import train_test_split
from collections import defaultdict
def stratified_split(filepath, class_field="category", train_ratio=0.7, val_ratio=0.15, test_ratio=0.15, random_seed=42):
    """Split a JSONL dataset into train/val/test files, stratified by class.

    Each class is split separately with ``train_test_split`` so that every
    output file keeps roughly the same class proportions as the input. The
    splits are written to ``train.jsonl``, ``val.jsonl`` and ``test.jsonl``
    in the current working directory.

    Args:
        filepath: Path to a JSON Lines file (one JSON object per line).
        class_field: Key holding the class label; examples without it are
            grouped under ``"unknown"``.
        train_ratio: Share of each class intended for training. Only used
            to derive the validation share of the non-test remainder.
        val_ratio: Share of each class intended for validation.
        test_ratio: Share of each class held out for testing.
        random_seed: Seed passed to both ``train_test_split`` calls.

    Returns:
        None. A summary of the split sizes is printed.
    """
    with open(filepath, encoding="utf-8") as f:
        # Skip blank lines; json.loads would raise on them.
        examples = [json.loads(line) for line in f if line.strip()]

    # Group by class
    by_class = defaultdict(list)
    for ex in examples:
        by_class[ex.get(class_field, "unknown")].append(ex)

    train, val, test = [], [], []
    # Split each class proportionally
    for class_label, class_examples in by_class.items():
        if len(class_examples) < 3:
            # Three non-empty parts cannot be cut from fewer than three
            # examples, and train_test_split raises when a part would be
            # empty. Keep such classes in the training set instead.
            print(
                f" {class_label}: only {len(class_examples)} example(s); "
                "kept entirely in train"
            )
            train.extend(class_examples)
            continue

        # First split: train+val vs test
        temp, test_split = train_test_split(
            class_examples,
            test_size=test_ratio,
            random_state=random_seed,
        )
        # Second split: train vs val. The validation share is rescaled
        # because `temp` no longer contains the test portion.
        train_split, val_split = train_test_split(
            temp,
            test_size=val_ratio / (train_ratio + val_ratio),
            random_state=random_seed,
        )
        train.extend(train_split)
        val.extend(val_split)
        test.extend(test_split)

    splits = (("Train", "train.jsonl", train), ("Val", "val.jsonl", val), ("Test", "test.jsonl", test))

    # Write splits
    for _, out_name, rows in splits:
        with open(out_name, "w", encoding="utf-8") as f:
            f.writelines(json.dumps(ex) + "\n" for ex in rows)

    total = len(examples)
    for title, _, rows in splits:
        # Guard the percentage against an empty input file.
        pct = 100 * len(rows) / total if total else 0.0
        print(f"{title}: {len(rows)} ({pct:.1f}%)")
# Example run: write train/val/test splits of dataset.jsonl by "category".
stratified_split(filepath="dataset.jsonl", class_field="category")
Balancing Strategy 2: Oversampling Rare Classes
If a class has too few examples, duplicate them (carefully) or use synthetic generation.
import json
from collections import Counter
import random
def oversample_to_balance(filepath, output_filepath, target_ratio=1.0, random_seed=42, class_field="category"):
    """Oversample rare classes by random duplication and write the result.

    Every class smaller than ``int(max_count * target_ratio)`` is topped up
    to that size by sampling its own examples with replacement; larger
    classes are kept unchanged.

    Args:
        filepath: Path to the input JSON Lines file.
        output_filepath: Path the balanced JSON Lines file is written to.
        target_ratio: Target class size as a fraction of the largest class
            (1.0 means every class reaches the largest class's size).
        random_seed: Seed for the private random generator used to pick
            duplicates, so repeated runs give the same output.
        class_field: Key holding the class label; examples without it are
            grouped under ``"unknown"``.

    Returns:
        The list of examples written to ``output_filepath``.
    """
    # Local import: the surrounding snippet only imports Counter.
    from collections import defaultdict

    # A private generator gives the same draws as seeding the module-level
    # one, without resetting global random state for the caller.
    rng = random.Random(random_seed)

    with open(filepath, encoding="utf-8") as f:
        # Skip blank lines; json.loads would raise on them.
        examples = [json.loads(line) for line in f if line.strip()]

    # Group by class
    by_class = defaultdict(list)
    for ex in examples:
        by_class[ex.get(class_field, "unknown")].append(ex)

    # Size of the largest class; 0 for an empty dataset.
    max_count = max((len(v) for v in by_class.values()), default=0)
    # The target does not depend on the class, so compute it once.
    target_count = int(max_count * target_ratio)

    balanced = []
    print("Oversampling:")
    for class_label, class_examples in by_class.items():
        deficit = target_count - len(class_examples)
        if deficit <= 0:
            # Class already has enough examples: keep all of them.
            balanced.extend(class_examples)
            print(f" {class_label}: {len(class_examples)} (no oversampling needed)")
            continue

        # Top up with random duplicates (sampling with replacement).
        oversampled = class_examples + rng.choices(class_examples, k=deficit)
        balanced.extend(oversampled)
        print(f" {class_label}: {len(class_examples)} -> {len(oversampled)} (+{deficit})")

    # Write balanced dataset
    with open(output_filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in balanced)

    print(f"\nTotal: {len(examples)} -> {len(balanced)} examples")
    return balanced
Caution: Oversampling can cause overfitting if you duplicate the exact same examples. Instead, use synthetic data generation (article 7) or targeted data augmentation.
Balancing Strategy 3: Class Weighting
Some fine-tuning APIs support class weights: weight the loss for rare classes higher so the model prioritizes them.
import json
from collections import Counter
import numpy as np
def compute_class_weights(filepath, class_field="category"):
    """Compute per-class loss weights; rare classes get a higher weight.

    Args:
        filepath: Path to a JSON Lines file (one JSON object per line).
        class_field: Key holding the class label. Examples without it are
            counted under the label ``None``.

    Returns:
        A dict mapping class label to weight, scaled so that the mean of
        the weights is 1. Empty when the file contains no examples.
    """
    with open(filepath, encoding="utf-8") as f:
        # Skip blank lines; json.loads would raise on them.
        classes = [json.loads(line).get(class_field) for line in f if line.strip()]

    class_counts = Counter(classes)
    total = len(classes)
    n_classes = len(class_counts)

    # Weight formula: total / (n_classes * count).
    # A perfectly balanced dataset gives every class weight 1; a class
    # with fewer examples than average gets a weight above 1.
    raw = {label: total / (n_classes * count) for label, count in class_counts.items()}

    # Normalize to average = 1. An empty dataset has no weights to
    # average, so skip the division there.
    if raw:
        avg_weight = sum(raw.values()) / len(raw)
        weights = {label: value / avg_weight for label, value in raw.items()}
    else:
        weights = {}

    print("Class weights:")
    for class_label in sorted(weights, key=weights.get, reverse=True):
        print(f" {class_label}: {weights[class_label]:.2f}x")
    return weights
weights = compute_class_weights("dataset.jsonl")

# How the weights are applied depends on the training setup (the notes here
# mention a weighted loss function for OpenAI and a custom loss for
# Anthropic). The dict below is one example JSON shape for handing them over;
# values are cast to plain floats so they serialize cleanly.
weights_json = {
    "class_weights": {label: float(value) for label, value in weights.items()},
}
print("\nWeights for fine-tuning API:", json.dumps(weights_json, indent=2), sep="\n")
Balancing Strategy 4: Domain-Based Balancing
For datasets with multiple domains (e.g., US vs UK English, code vs docs), ensure representation:
def balance_by_domain(filepath, output_filepath, domains_field="domain", min_ratio=0.1, random_seed=None):
    """Oversample small domains up to a minimum size and write the result.

    Each domain with fewer than ``int(len(examples) * min_ratio)`` examples
    is topped up to that size by sampling its own examples with
    replacement. The minimum is computed from the ORIGINAL dataset size, so
    after oversampling grows the dataset a domain's final share can end up
    slightly below ``min_ratio``.

    Args:
        filepath: Path to the input JSON Lines file.
        output_filepath: Path the balanced JSON Lines file is written to.
        domains_field: Key holding the domain; examples without it are
            grouped under ``"unknown"``.
        min_ratio: Minimum domain size as a fraction of the original
            number of examples.
        random_seed: Optional seed for reproducible duplicates. ``None``
            draws from the module-level ``random`` generator.

    Returns:
        The list of examples written to ``output_filepath``.
    """
    # Local imports keep this snippet runnable on its own.
    import json
    import random
    from collections import defaultdict

    rng = random if random_seed is None else random.Random(random_seed)

    with open(filepath, encoding="utf-8") as f:
        # Skip blank lines; json.loads would raise on them.
        examples = [json.loads(line) for line in f if line.strip()]

    by_domain = defaultdict(list)
    for ex in examples:
        by_domain[ex.get(domains_field, "unknown")].append(ex)

    # The floor is the same for every domain, so compute it once.
    min_count = int(len(examples) * min_ratio)

    balanced = []
    print(f"Balancing by domain (min {100*min_ratio:.0f}% per domain):")
    for domain, domain_examples in by_domain.items():
        shortfall = min_count - len(domain_examples)
        if shortfall > 0:
            # Oversample this domain with random duplicates.
            domain_examples = domain_examples + rng.choices(domain_examples, k=shortfall)
        balanced.extend(domain_examples)
        print(f" {domain}: {len(domain_examples)} examples")

    with open(output_filepath, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(ex) + "\n" for ex in balanced)

    print(f"Total: {len(balanced)} examples")
    return balanced
Measuring Balance Effectiveness
After balancing, compute metrics on validation set to verify:
from sklearn.metrics import precision_recall_fscore_support, confusion_matrix
def evaluate_class_performance(predictions, true_labels, class_labels):
    """Print per-class precision/recall/F1 and return macro and weighted F1.

    Args:
        predictions: Predicted labels, aligned with ``true_labels``.
        true_labels: Ground-truth labels.
        class_labels: Labels to report on, in the order they are printed.

    Returns:
        A ``(macro_f1, weighted_f1)`` tuple.
    """
    scores = precision_recall_fscore_support(
        true_labels, predictions, labels=class_labels
    )
    per_class_precision, per_class_recall, per_class_f1, per_class_support = scores

    print("Per-class performance:")
    rows = zip(class_labels, per_class_precision, per_class_recall, per_class_f1)
    for name, prec, rec, f_score in rows:
        print(f" {name}: P={prec:.3f}, R={rec:.3f}, F1={f_score:.3f}")

    # Macro: every class counts equally. Weighted: classes count in
    # proportion to their support (number of true examples).
    macro_f1 = np.mean(per_class_f1)
    weighted_f1 = np.average(per_class_f1, weights=per_class_support)
    gap = abs(macro_f1 - weighted_f1)

    print(f"\nMacro F1 (unweighted): {macro_f1:.3f}")
    print(f"Weighted F1 (by class frequency): {weighted_f1:.3f}")
    print(f"Gap (should be small): {gap:.3f}")
    return macro_f1, weighted_f1
A small gap between macro and weighted F1 indicates balanced performance across classes.
Key Takeaways
- Imbalanced datasets cause biased models: rare classes suffer poor performance while common classes excel.
- Detect imbalance by computing class distribution and imbalance ratio (largest class count / smallest class count).
- Use stratified splitting to ensure each fold is representative.
- Oversample rare classes carefully (avoid exact duplication; use synthetic generation).
- Apply class weights if your fine-tuning API supports them.
- Evaluate on macro-average F1 (unweighted per-class) to detect remaining imbalance.
Frequently Asked Questions
How imbalanced is too imbalanced?
An imbalance ratio above 10x (max/min class count) is severe. With two classes at 10x, a model that always predicts the majority class reaches about 91% overall accuracy while scoring 0% on the rare class. Aim to reduce imbalance below 3x through sampling or weighting.
Should I balance before or after cleaning?
Balance after cleaning. Otherwise, you might oversample duplicates or malformed examples. Clean first, then balance.
Can I oversample without creating copies?
Yes. Use synthetic data generation (next article) or targeted augmentation:
import anthropic
client = anthropic.Anthropic()
def augment_rare_class(rare_examples, n_per_example=3):
    """Generate reworded variations of rare-class examples via the API.

    For each example, the module-level ``client`` is asked for
    ``n_per_example`` rewordings of its ``"instruction"``. Every non-empty
    line of the reply becomes a shallow copy of the example with the
    instruction replaced.

    Args:
        rare_examples: Iterable of dicts, each with an ``"instruction"`` key.
        n_per_example: Number of variations requested per example. The
            model's reply is not truncated, so it may return more or fewer.

    Returns:
        A list of the generated examples (the originals are not included).
    """
    augmented = []
    for ex in rare_examples:
        # Ask for the number of variations the caller requested rather than
        # a fixed count.
        prompt = (
            "\n"
            "Given this example instruction:\n"
            f'"{ex["instruction"]}"\n'
            f"Generate {n_per_example} variations of this instruction that a user might ask.\n"
            "Ensure they're semantically similar but use different wording.\n"
            "Format: one per line.\n"
        )
        msg = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=200,
            messages=[{"role": "user", "content": prompt}],
        )
        for raw_line in msg.content[0].text.splitlines():
            variation = raw_line.strip()
            # Blank separator lines in the reply would otherwise become
            # examples with an empty instruction.
            if not variation:
                continue
            new_ex = ex.copy()
            new_ex["instruction"] = variation
            augmented.append(new_ex)
    return augmented
What's the downside of perfect balance (1:1 ratio)?
Perfect balance can hurt overall accuracy if classes are naturally imbalanced. If 95% of real-world tickets are billing, forcing 50:50 in training makes the model forget the prior. Use domain knowledge: balance rare classes to 5–20% minimum representation, not 50%.
How do I balance multi-label datasets (examples have multiple labels)?
Stratify by the primary label, or use MultilabelStratifiedShuffleSplit from the iterative-stratification package (imported as iterstrat):
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
# X and y must already be defined; y is assumed to be a binary indicator
# matrix of shape (n_samples, n_labels).
mss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# n_splits=1, so the splitter yields a single (train, test) index pair.
train_idx, test_idx = next(iter(mss.split(X, y)))
X_train, y_train = X[train_idx], y[train_idx]
X_test, y_test = X[test_idx], y[test_idx]