Dataset Diversity Checker – Quick SSIM Tool for LoRA Training

(Not that type of diversity, Mr. Secretary of Defense...War, calm down.)

A simple Python script that instantly tells you how diverse your image dataset is and whether it contains any duplicates or near-duplicates.

If you train SDXL character LoRAs, this tool helps you avoid redundant images and ensures your dataset has enough variety for better generalization.

What is SSIM?

SSIM (Structural Similarity Index) measures how visually similar two images are based on luminance, contrast, and structure. It returns a score from 0.0 to 1.0 (1.0 = identical). The script averages SSIM across all image pairs and converts it into a clean 0–100 Diversity Score (higher = more diverse).
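The SSIM math is compact enough to sanity-check in isolation. Here is a minimal single-channel sketch that mirrors the Gaussian-filtered SSIM the script uses below (the random arrays are stand-ins for real grayscale images, not part of the tool):

```python
import numpy as np
from scipy.ndimage import gaussian_filter

def ssim(img1, img2, C1=0.01**2, C2=0.03**2):
    # Single-channel SSIM on float arrays in [0, 1]; 1.0 = identical
    mu1, mu2 = gaussian_filter(img1, 1.5), gaussian_filter(img2, 1.5)
    s1 = gaussian_filter(img1 * img1, 1.5) - mu1 * mu1
    s2 = gaussian_filter(img2 * img2, 1.5) - mu2 * mu2
    s12 = gaussian_filter(img1 * img2, 1.5) - mu1 * mu2
    return (((2 * mu1 * mu2 + C1) * (2 * s12 + C2)) /
            ((mu1 * mu1 + mu2 * mu2 + C1) * (s1 + s2 + C2))).mean()

rng = np.random.default_rng(0)
a = rng.random((64, 64))   # stand-in for a grayscale image
b = rng.random((64, 64))   # unrelated noise
print(ssim(a, a))  # identical images -> 1.0
print(ssim(a, b))  # unrelated noise -> much lower
```

The constants C1 and C2 stabilize the ratio in flat regions; they assume pixel values are scaled to [0, 1], which is why the script loads images as normalized floats.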

Why this is useful for LoRA training

  • Quickly spots hidden duplicates or near-duplicates before training.

  • Gives you one clear score + color tier so you know if your dataset is good enough.

  • Especially helpful for character LoRAs where you want variation in poses, angles, expressions, and lighting.

  • Runs completely offline, no complicated setup.

Example: My “#1PatriotJesusPizza” character dataset scored 67.3 (🔵 BLUE – Good Diversity).

How to use it

  1. Put your training images in one folder.

  2. Place the script in the same folder.

  3. Run the .py file (double-click or python dataset_diversity_checker.py).

  4. It will show any duplicates + your overall diversity score and tier.
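Starting from a clean environment, the steps above might look like this (assuming pip and a standard Python 3 install; replace /path/to/dataset with your actual image folder):

```shell
# Install the script's dependencies
pip install numpy scipy pillow torchvision

# Copy the script into the dataset folder and run it there
cp dataset_diversity_checker.py /path/to/dataset/
cd /path/to/dataset
python dataset_diversity_checker.py
```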

5-Tier Diversity Scale

  • 🔴 RED (0–20) — Extremely Redundant

  • 🟠 ORANGE (20–40) — High Redundancy

  • 🟡 YELLOW (40–60) — Moderate Diversity

  • 🔵 BLUE (60–80) — Good Diversity ← Your 67.3 is here

  • 🟢 GREEN (80–100) — Excellent Diversity
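The tiers are exact 20-point buckets computed from the average pairwise SSIM. A standalone sketch of the conversion (the function name tier_for is mine, not from the script):

```python
def tier_for(score: float) -> str:
    # Exact 20-point buckets, matching the 5-tier scale above
    if score >= 80:
        return "GREEN - Excellent Diversity"
    elif score >= 60:
        return "BLUE - Good Diversity"
    elif score >= 40:
        return "YELLOW - Moderate Diversity"
    elif score >= 20:
        return "ORANGE - High Redundancy"
    return "RED - Extremely Redundant"

# Average pairwise SSIM of 0.327 -> diversity score of 67.3
avg_ssim = 0.327
score = round((1 - avg_ssim) * 100, 1)
print(score, tier_for(score))  # -> 67.3 BLUE - Good Diversity
```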

Download the ready-to-use script below 👇 (attached as dataset_diversity_checker.py)

Or copy the code manually (if you prefer not to download):

Python

"""
Dataset Diversity Checker for Character LoRAs
Simple SSIM-based tool to measure dataset diversity and detect duplicates.
Completely offline, no tracking, no personal data.

Usage: Place this .py file in your image folder and run it.
"""

import os
import numpy as np
from scipy.ndimage import gaussian_filter
from torchvision import transforms
from PIL import Image

def ssim(img1, img2, C1=0.01**2, C2=0.03**2):
    # Single-channel SSIM (expects float arrays in [0, 1]; returns 0-1, 1 = identical)
    mu1 = gaussian_filter(img1, 1.5)
    mu2 = gaussian_filter(img2, 1.5)
    mu1_sq = mu1 ** 2
    mu2_sq = mu2 ** 2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = gaussian_filter(img1 ** 2, 1.5) - mu1_sq
    sigma2_sq = gaussian_filter(img2 ** 2, 1.5) - mu2_sq
    sigma12 = gaussian_filter(img1 * img2, 1.5) - mu1_mu2
    ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
    return ssim_map.mean()

def load_image(path, size=(256, 256)):
    # Returns a (3, H, W) float array with values in [0, 1]
    transform = transforms.Compose([transforms.Resize(size), transforms.ToTensor()])
    img = Image.open(path).convert('RGB')
    return transform(img).numpy()

def analyze_dataset():
    # Automatically uses the folder where the .py file is located
    folder = os.path.dirname(os.path.abspath(__file__))
    print(f"📁 Scanning folder: {folder}\n")

    # Get all image files
    image_paths = [os.path.join(folder, f) for f in os.listdir(folder)
                   if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    n = len(image_paths)
    if n < 2:
        print("❌ Need at least 2 images in the folder to analyze.")
        return

    print(f"✅ Found {n} images. Calculating similarities...\n")

    total_similarity = 0.0
    pair_count = 0
    near_duplicates = []
    strong_duplicates = []

    NEAR_THRESHOLD = 0.92
    DUPE_THRESHOLD = 0.98

    # Load each image once up front (avoids re-reading files for every pair)
    images = [load_image(p) for p in image_paths]
    names = [os.path.basename(p) for p in image_paths]

    for i in range(n):
        for j in range(i + 1, n):
            # Average SSIM across the three RGB channels
            score = np.mean([ssim(images[i][c], images[j][c]) for c in range(3)])

            total_similarity += score
            pair_count += 1

            # Check for duplicates / near-duplicates
            if score >= DUPE_THRESHOLD:
                strong_duplicates.append((names[i], names[j], score))
            elif score >= NEAR_THRESHOLD:
                near_duplicates.append((names[i], names[j], score))

    # Calculate overall score
    avg_similarity = total_similarity / pair_count
    diversity_score = (1 - avg_similarity) * 100   # 0-100, higher = more diverse

    # Tier (exactly 20% buckets)
    if diversity_score >= 80:
        tier = "🟢 GREEN - Excellent Diversity"
    elif diversity_score >= 60:
        tier = "🔵 BLUE - Good Diversity"
    elif diversity_score >= 40:
        tier = "🟡 YELLOW - Moderate Diversity"
    elif diversity_score >= 20:
        tier = "🟠 ORANGE - High Redundancy"
    else:
        tier = "🔴 RED - Extremely Redundant"

    # === FINAL OUTPUT ===
    print("=" * 65)
    print("DATASET ANALYSIS COMPLETE")
    print("=" * 65)

    if strong_duplicates:
        print(f"🚨 STRONG DUPLICATES (SSIM ≥ {DUPE_THRESHOLD}):")
        for a, b, s in sorted(strong_duplicates, key=lambda x: -x[2]):
            print(f"   • {a}  ↔  {b}   (score: {s:.4f})")
        print()

    if near_duplicates:
        print(f"⚠️  NEAR-DUPLICATES (SSIM ≥ {NEAR_THRESHOLD}):")
        for a, b, s in sorted(near_duplicates, key=lambda x: -x[2]):
            print(f"   • {a}  ↔  {b}   (score: {s:.4f})")
        print()
    else:
        print("✅ No near-duplicates or duplicates found!\n")

    print(f"OVERALL DATASET DIVERSITY SCORE: {diversity_score:.1f}/100")
    print(f"Diversity Level: {tier}")
    print("=" * 65)
    print("Higher score = more diverse dataset.")

if __name__ == "__main__":
    analyze_dataset()