Skip to content

Compare Repertoires

Learn how to measure similarity and differences between TCR repertoires.

Quick Reference

from LZGraphs import jensen_shannon_divergence

# Compare two graphs
jsd = jensen_shannon_divergence(graph1, graph2)
print(f"JS Divergence: {jsd:.4f}")  # 0 = identical, 1 = completely different

Building Graphs for Comparison

from LZGraphs import AAPLZGraph
import pandas as pd

# Load two repertoires
data1 = pd.read_csv("repertoire1.csv")
data2 = pd.read_csv("repertoire2.csv")

# Build graphs
graph1 = AAPLZGraph(data1, verbose=False)
graph2 = AAPLZGraph(data2, verbose=False)

Jensen-Shannon Divergence

The most common method for comparing repertoires:

from LZGraphs import jensen_shannon_divergence

jsd = jensen_shannon_divergence(graph1, graph2)
print(f"JS Divergence: {jsd:.4f}")

Interpretation

JSD Value Interpretation
0.0 Identical distributions
0.0-0.1 Very similar
0.1-0.3 Moderately similar
0.3-0.5 Different
0.5-1.0 Very different

Properties

  • Symmetric: JSD(A,B) = JSD(B,A)
  • Bounded: Always between 0 and 1
  • Metric: Satisfies triangle inequality (when square-rooted)

Comparing Multiple Repertoires

Pairwise Comparison

from itertools import combinations
import pandas as pd

repertoires = {
    'healthy1': graph1,
    'healthy2': graph2,
    'disease1': graph3,
    'disease2': graph4
}

# Calculate all pairwise distances
results = []
for (name1, g1), (name2, g2) in combinations(repertoires.items(), 2):
    jsd = jensen_shannon_divergence(g1, g2)
    results.append({
        'repertoire1': name1,
        'repertoire2': name2,
        'jsd': jsd
    })

df = pd.DataFrame(results)
print(df.sort_values('jsd'))

Distance Matrix

import numpy as np

names = list(repertoires.keys())
n = len(names)
dist_matrix = np.zeros((n, n))

for i, name1 in enumerate(names):
    for j, name2 in enumerate(names):
        if i < j:
            jsd = jensen_shannon_divergence(
                repertoires[name1],
                repertoires[name2]
            )
            dist_matrix[i, j] = jsd
            dist_matrix[j, i] = jsd

# Create DataFrame
dist_df = pd.DataFrame(dist_matrix, index=names, columns=names)
print(dist_df)

Heatmap Visualization

import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(10, 8))
sns.heatmap(dist_df, annot=True, cmap='RdYlBu_r', vmin=0, vmax=1)
plt.title('Repertoire Similarity (JSD)')
plt.tight_layout()
plt.savefig('repertoire_heatmap.png', dpi=300)

Comparing Diversity Metrics

K1000 Comparison

from LZGraphs import K1000_Diversity, AAPLZGraph

def compare_diversity(data1, data2, name1="Rep1", name2="Rep2"):
    seqs1 = data1['cdr3_amino_acid'].tolist()
    seqs2 = data2['cdr3_amino_acid'].tolist()

    k1000_1 = K1000_Diversity(seqs1, AAPLZGraph.encode_sequence, draws=30)
    k1000_2 = K1000_Diversity(seqs2, AAPLZGraph.encode_sequence, draws=30)

    print(f"{name1} K1000: {k1000_1:.1f}")
    print(f"{name2} K1000: {k1000_2:.1f}")
    print(f"Difference: {abs(k1000_1 - k1000_2):.1f}")

compare_diversity(data1, data2, "Healthy", "Disease")

Entropy Comparison

from LZGraphs import node_entropy, edge_entropy, graph_entropy

def compare_entropy(graph1, graph2, name1="Rep1", name2="Rep2"):
    metrics = {}

    for name, g in [(name1, graph1), (name2, graph2)]:
        metrics[name] = {
            'node_entropy': node_entropy(g),
            'edge_entropy': edge_entropy(g),
            'graph_entropy': graph_entropy(g)
        }

    df = pd.DataFrame(metrics).T
    print(df)
    return df

compare_entropy(graph1, graph2, "Healthy", "Disease")

Cross-Repertoire Analysis

Sequence Probability Across Repertoires

Check how likely sequences from one repertoire are in another:

def cross_probability(sequences, source_graph, target_graph):
    """Calculate sequence probabilities in target graph."""
    results = []

    for seq in sequences:
        try:
            encoded = AAPLZGraph.encode_sequence(seq)
            source_prob = source_graph.walk_probability(encoded, use_log=True)
            target_prob = target_graph.walk_probability(encoded, use_log=True)

            results.append({
                'sequence': seq,
                'source_log_p': source_prob,
                'target_log_p': target_prob,
                'diff': source_prob - target_prob
            })
        except:
            pass

    return pd.DataFrame(results)

# Sample sequences from repertoire 1
sample_seqs = data1['cdr3_amino_acid'].sample(100).tolist()

# Calculate probabilities in both graphs
cross_df = cross_probability(sample_seqs, graph1, graph2)
print(cross_df.describe())

Repertoire-Specific Sequences

Find sequences that are specific to one repertoire:

def find_specific_sequences(sequences, graph1, graph2, threshold=-5):
    """Find sequences specific to graph1 (not in graph2)."""
    specific = []

    for seq in sequences:
        try:
            enc = AAPLZGraph.encode_sequence(seq)
            p1 = graph1.walk_probability(enc, use_log=True)
            p2 = graph2.walk_probability(enc, use_log=True)

            # Specific if much more likely in graph1
            if p1 - p2 > threshold:
                specific.append({
                    'sequence': seq,
                    'log_p_graph1': p1,
                    'log_p_graph2': p2,
                    'specificity': p1 - p2
                })
        except:
            pass

    return pd.DataFrame(specific).sort_values('specificity', ascending=False)

# Find repertoire 1 specific sequences
specific_seqs = find_specific_sequences(
    data1['cdr3_amino_acid'].tolist(),
    graph1, graph2
)
print(specific_seqs.head(10))

Gene Usage Comparison

def compare_gene_usage(graph1, graph2, name1="Rep1", name2="Rep2"):
    """Compare V/J gene usage between repertoires."""

    # V genes
    v1 = graph1.marginal_vgenes
    v2 = graph2.marginal_vgenes

    # Align indices
    all_v = set(v1.index) | set(v2.index)
    v_comparison = pd.DataFrame({
        name1: v1.reindex(all_v).fillna(0),
        name2: v2.reindex(all_v).fillna(0)
    })
    v_comparison['diff'] = v_comparison[name1] - v_comparison[name2]

    # J genes
    j1 = graph1.marginal_jgenes
    j2 = graph2.marginal_jgenes

    all_j = set(j1.index) | set(j2.index)
    j_comparison = pd.DataFrame({
        name1: j1.reindex(all_j).fillna(0),
        name2: j2.reindex(all_j).fillna(0)
    })
    j_comparison['diff'] = j_comparison[name1] - j_comparison[name2]

    return v_comparison, j_comparison

v_comp, j_comp = compare_gene_usage(graph1, graph2, "Healthy", "Disease")
print("Top V gene differences:")
print(v_comp.sort_values('diff', key=abs, ascending=False).head())

Complete Comparison Pipeline

def full_repertoire_comparison(data1, data2, name1="Rep1", name2="Rep2"):
    """Complete comparison of two repertoires."""

    # Build graphs
    print("Building graphs...")
    graph1 = AAPLZGraph(data1, verbose=False)
    graph2 = AAPLZGraph(data2, verbose=False)

    # Basic stats
    print(f"\n{'='*50}")
    print("BASIC STATISTICS")
    print(f"{'='*50}")
    print(f"{name1}: {data1.shape[0]} sequences, {graph1.graph.number_of_nodes()} nodes")
    print(f"{name2}: {data2.shape[0]} sequences, {graph2.graph.number_of_nodes()} nodes")

    # Divergence
    print(f"\n{'='*50}")
    print("DIVERGENCE")
    print(f"{'='*50}")
    jsd = jensen_shannon_divergence(graph1, graph2)
    print(f"Jensen-Shannon Divergence: {jsd:.4f}")

    # Diversity
    print(f"\n{'='*50}")
    print("DIVERSITY")
    print(f"{'='*50}")
    seqs1 = data1['cdr3_amino_acid'].tolist()
    seqs2 = data2['cdr3_amino_acid'].tolist()
    k1 = K1000_Diversity(seqs1, AAPLZGraph.encode_sequence, draws=30)
    k2 = K1000_Diversity(seqs2, AAPLZGraph.encode_sequence, draws=30)
    print(f"{name1} K1000: {k1:.1f}")
    print(f"{name2} K1000: {k2:.1f}")

    # Entropy
    print(f"\n{'='*50}")
    print("ENTROPY")
    print(f"{'='*50}")
    print(f"{name1} node entropy: {node_entropy(graph1):.2f}")
    print(f"{name2} node entropy: {node_entropy(graph2):.2f}")

    return graph1, graph2, jsd

# Run comparison
g1, g2, jsd = full_repertoire_comparison(data1, data2, "Healthy", "Disease")

Next Steps