Examples¶
Practical examples you can copy-paste and run. Each example is self-contained and uses the sample datasets included in the repository.
Jupyter Notebooks¶
Full interactive notebooks are available on GitHub:
1. Getting Started¶
Beginner — Build a graph, score sequences, simulate new ones.
2. Analytics and Diversity¶
Intermediate — Hill numbers, richness predictions, occupancy models.
3. Advanced Usage¶
Advanced — Posterior personalization, feature alignment, PGEN distributions.
To run them locally:
# Clone the repository
git clone https://github.com/MuteJester/LZGraphs.git
cd LZGraphs
# Install the package, plus Jupyter to run the notebooks
pip install . && pip install jupyter
# Launch the notebook server from the examples directory
cd examples && jupyter notebook
Example 1: Basic repertoire analysis¶
A complete workflow from loading data to measuring diversity.
import csv

from LZGraphs import LZGraph

# Load sequences (and optional V/J gene calls) from a CSV file.
seqs, v_genes, j_genes = [], [], []
with open("examples/ExampleData3.csv") as f:
    for row in csv.DictReader(f):
        seqs.append(row['cdr3_amino_acid'])
        # V/J columns may be absent; fall back to empty strings.
        v_genes.append(row.get('v_call', ''))
        j_genes.append(row.get('j_call', ''))

# Build the graph ('aap' = amino-acid positional variant).
graph = LZGraph(seqs, variant='aap')
print(graph)
# LZGraph(variant='aap', nodes=1721, edges=9644)

# Basic statistics
print(f"Sequences: {graph.n_sequences}")
print(f"Lengths: {sorted(graph.length_distribution.keys())}")
print(f"Density: {graph.density:.4f}")

# Score a known sequence (log generation probability).
log_p = graph.lzpgen("CASSLEPSGGTDTQYF")
print(f"\nlog P(CASSLEPSGGTDTQYF) = {log_p:.2f}")

# Diversity (Hill numbers: D(1) = effective diversity, D(2) = inverse Simpson).
print(f"\nEffective diversity D(1): {graph.effective_diversity():,.0f}")
print(f"Inverse Simpson D(2): {graph.hill_number(2):,.0f}")

# Predicted richness (unique sequences) at different sampling depths.
for depth in [1_000, 10_000, 100_000]:
    r = graph.predicted_richness(depth)
    print(f"At depth {depth:>7,d}: {r:>8,.0f} unique sequences")
Example 2: Comparing two repertoires¶
Build graphs from two samples and measure their divergence.
import csv

import numpy as np

from LZGraphs import LZGraph, jensen_shannon_divergence

# Split data into two "repertoires" for demonstration.
with open("examples/ExampleData3.csv") as f:
    all_seqs = [row['cdr3_amino_acid'] for row in csv.DictReader(f)]
seqs_a = all_seqs[:2500]
seqs_b = all_seqs[2500:]

graph_a = LZGraph(seqs_a, variant='aap')
graph_b = LZGraph(seqs_b, variant='aap')

# Jensen-Shannon Divergence between the two graph models.
jsd = jensen_shannon_divergence(graph_a, graph_b)
print(f"JSD: {jsd:.4f}")

# Compare diversity profiles across Hill orders (0 = richness, 1, 2).
for alpha in [0, 1, 2]:
    da = graph_a.hill_number(alpha)
    db = graph_b.hill_number(alpha)
    print(f"D({alpha}): A = {da:,.0f} B = {db:,.0f}")

# Cross-score: how probable are A's sequences under B's model?
cross_scores = graph_b.lzpgen(seqs_a[:100])
print(f"\nMean log P(A seqs | B model): {np.mean(cross_scores):.2f}")
Example 3: Sequence generation and evaluation¶
Generate synthetic sequences and verify they match the repertoire's patterns.
Setup
Examples 3-6 assume seqs and graph from Example 1 above.
from collections import Counter

import numpy as np

# Generate 10,000 synthetic sequences (seeded for reproducibility).
# Assumes `seqs` and `graph` from Example 1 are already defined.
result = graph.simulate(10_000, seed=42)

# Length distribution of the generated sequences, with a simple text histogram.
gen_lengths = Counter(len(s) for s in result.sequences)
print("Generated length distribution:")
for length in sorted(gen_lengths):
    bar = "#" * (gen_lengths[length] // 100)  # one '#' per 100 sequences
    print(f"  {length:2d}: {gen_lengths[length]:5d} {bar}")

# Novelty: how many generated sequences are absent from the training data?
train_set = set(seqs)
novel = [s for s in set(result.sequences) if s not in train_set]
print(f"\nUnique generated: {len(set(result.sequences))}")
print(f"Novel (not in training): {len(novel)}")
print(f"Novelty rate: {len(novel) / len(set(result.sequences)):.1%}")

# Log-probability statistics of the simulated sequences.
print(f"\nSimulated log P: mean={result.log_probs.mean():.2f}, "
      f"std={result.log_probs.std():.2f}")
Example 4: Graph algebra¶
Combine and decompose repertoires using set operations.
from LZGraphs import LZGraph

# Split into two "conditions" for demonstration.
# Assumes `seqs` from Example 1 is already defined.
healthy_seqs = seqs[:2500]
disease_seqs = seqs[2500:]

healthy = LZGraph(healthy_seqs, variant='aap')
disease = LZGraph(disease_seqs, variant='aap')

# Union: combined repertoire.
combined = healthy | disease
print(f"Healthy: {healthy.n_edges} edges")
print(f"Disease: {disease.n_edges} edges")
print(f"Combined: {combined.n_edges} edges")

# Intersection: structure shared by both conditions.
shared = healthy & disease
print(f"Shared: {shared.n_edges} edges")

# Difference: what's unique to disease.
disease_only = disease - healthy
print(f"Disease-specific: {disease_only.n_edges} edges")

# Score a candidate against the disease-specific model vs the healthy one.
candidate = "CASSLGQAYEQYF"
print(f"\nlog P(candidate | disease-specific): {disease_only.lzpgen(candidate):.2f}")
print(f"log P(candidate | healthy): {healthy.lzpgen(candidate):.2f}")
Example 5: ML feature extraction¶
Build a classification pipeline using graph-derived features.
import numpy as np

from LZGraphs import LZGraph

# Requires scikit-learn for the classifier below.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Build a reference graph from all data.
# Assumes `seqs` from Example 1 is already defined.
ref = LZGraph(seqs, variant='aap')
print(f"Feature dimension: {ref.n_nodes}")

# Split into "samples" of 500 seqs each for demonstration (10 samples total).
samples = [seqs[i:i+500] for i in range(0, 5000, 500)]
labels = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]  # first 5 "healthy", last 5 "disease"

def get_features(sample_seqs):
    """Build a per-sample feature vector from graph-derived descriptors."""
    g = LZGraph(sample_seqs, variant='aap')
    aligned = ref.feature_aligned(g)   # reference-aligned (n_nodes dim)
    stats = g.feature_stats()          # 15-dim summary
    mass = g.feature_mass_profile()    # 31-dim position profile
    return np.concatenate([aligned, stats, mass])

X = np.array([get_features(s) for s in samples])
y = np.array(labels)
print(f"Feature matrix: {X.shape}")

# Train a classifier and estimate accuracy with 5-fold cross-validation.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
print(f"Accuracy: {scores.mean():.1%} +/- {scores.std():.1%}")
Example 6: Bayesian personalization¶
Adapt a population model to an individual patient.
from LZGraphs import LZGraph

# Population-level graph (using first 4000 sequences as "population").
# Assumes `seqs` from Example 1 is already defined.
population = LZGraph(seqs[:4000], variant='aap')

# One patient's sequences (last 1000 as "individual").
patient_seqs = seqs[4000:]

# Create personalized models with different prior strengths.
# kappa controls how strongly the population prior is weighted.
for kappa in [1.0, 10.0, 100.0]:
    personal = population.posterior(patient_seqs, kappa=kappa)
    d1 = personal.effective_diversity()
    print(f"kappa={kappa:5.1f}  D(1)={d1:,.0f}  "
          f"nodes={personal.n_nodes} edges={personal.n_edges}")
Small \(\kappa\) → the patient's data dominates (personalized). Large \(\kappa\) → the population prior dominates (conservative).
Sample Data¶
The repository includes these datasets for testing and examples:
| File | Sequences | Content |
|---|---|---|
| `examples/ExampleData3.csv` | 5,000 | Amino acid CDR3 sequences with V/J gene annotations |
See Also¶
- Getting Started — installation and first graph
- Tutorials — step-by-step learning paths
- API Reference — complete method documentation