Skip to content

Generate Sequences

Learn how to generate new sequences that follow your repertoire's statistical patterns.

Quick Reference

# Basic generation
walk = graph.random_walk()

# With gene constraints (rebinds `walk`; the call unpacks to walk, V gene, J gene)
walk, v_gene, j_gene = graph.genomic_random_walk()

# Convert to sequence: clean every node of the walk and concatenate the pieces
sequence = ''.join([AAPLZGraph.clean_node(n) for n in walk])

Basic Sequence Generation

Random Walk

Generate a sequence following edge probabilities:

from LZGraphs import AAPLZGraph
import pandas as pd

# Load the repertoire table and build the graph from it
data = pd.read_csv("repertoire.csv")
graph = AAPLZGraph(data, verbose=False)

# Draw a single random walk and show it
walk = graph.random_walk()
print(f"Walk: {walk}")

# Clean each node of the walk, then concatenate the pieces into one sequence
cleaned_nodes = map(AAPLZGraph.clean_node, walk)
sequence = ''.join(cleaned_nodes)
print(f"Sequence: {sequence}")

Generate Multiple Sequences

# One cleaned, concatenated sequence per random walk, 100 walks in total
generated = [
    ''.join(map(AAPLZGraph.clean_node, graph.random_walk()))
    for _ in range(100)
]

print(f"Generated {len(generated)} sequences")
print(f"Example: {generated[0]}")

Gene-Constrained Generation

Using genomic_random_walk

Generate sequences consistent with V/J gene usage:

# Draw one walk together with the V and J gene reported for it
walk, v_gene, j_gene = graph.genomic_random_walk()

# Clean the nodes and concatenate them into the sequence
sequence = ''.join(map(AAPLZGraph.clean_node, walk))

print(f"Sequence: {sequence}")
print(f"V gene: {v_gene}")
print(f"J gene: {j_gene}")

Requirements

genomic_random_walk() requires that the graph was built with V and J gene columns.

Generate with Specific Gene Frequencies

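Generated sequences follow the graph's marginal V/J gene distribution. To check this, compare the gene frequencies of many generated walks with the graph's own marginals: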

from collections import Counter

# Draw 1000 gene-annotated walks; only the V and J genes are recorded
results = []
for _ in range(1000):
    _walk, v, j = graph.genomic_random_walk()
    results.append(dict(v_gene=v, j_gene=j))

# Relative V gene frequencies among the generated walks
df = pd.DataFrame(results)
generated_v_freqs = df['v_gene'].value_counts(normalize=True)
print("V gene distribution:")
print(generated_v_freqs.head())

# The graph's own V gene marginals, for comparison
print("\nOriginal V gene distribution:")
print(graph.marginal_vgenes.head())

Advanced Generation

Generate Specific Lengths

Keep sampling random walks until one yields a sequence of the target length, giving up after max_attempts tries:

def generate_with_length(graph, target_length, max_attempts=1000):
    """Sample random walks until one produces a sequence of the wanted length.

    Args:
        graph: Graph object exposing ``random_walk()``.
        target_length: Required length of the cleaned, concatenated sequence.
        max_attempts: Maximum number of walks to draw before giving up.

    Returns:
        ``(sequence, walk)`` for the first walk whose sequence has exactly
        ``target_length`` characters, or ``(None, None)`` if none of the
        ``max_attempts`` walks matched.
    """
    for _attempt in range(max_attempts):
        candidate_walk = graph.random_walk()
        candidate = ''.join(map(AAPLZGraph.clean_node, candidate_walk))
        if len(candidate) != target_length:
            continue  # wrong length, draw another walk
        return candidate, candidate_walk
    return None, None

# Generate 15-mer
# generate_with_length returns (None, None) when no walk of that length was
# found within max_attempts, so guard before using the result.
seq, walk = generate_with_length(graph, 15)
if seq:
    print(f"Generated 15-mer: {seq}")

Generate from Specific Start

Inspect the initial states a walk can start from (random_walk() picks the starting state itself):

# List the initial states known to the graph (label and value on separate lines)
print("Initial states:", graph.initial_states, sep="\n")

# Note: random_walk starts from initial states by default
# The initial state is chosen based on observed frequencies

Batch Generation with Statistics

import pandas as pd
from tqdm import tqdm

def generate_repertoire(graph, n_sequences, use_genes=True):
    """Generate a synthetic repertoire.

    Args:
        graph: Graph object exposing ``random_walk()`` and, when
            ``use_genes`` is true, ``genomic_random_walk()``.
        n_sequences: Number of walks to draw.
        use_genes: If true, draw gene-annotated walks and record their V/J
            genes; otherwise the gene columns are filled with ``None``.

    Returns:
        pandas.DataFrame with one row per generated sequence and the columns
        ``sequence``, ``length``, ``v_gene`` and ``j_gene``. The columns are
        present even when ``n_sequences`` is 0.
    """
    columns = ['sequence', 'length', 'v_gene', 'j_gene']
    results = []

    for _ in tqdm(range(n_sequences), desc="Generating"):
        if use_genes:
            walk, v, j = graph.genomic_random_walk()
        else:
            walk = graph.random_walk()
            v, j = None, None

        seq = ''.join([AAPLZGraph.clean_node(n) for n in walk])
        results.append({
            'sequence': seq,
            'length': len(seq),
            'v_gene': v,
            'j_gene': j
        })

    # Passing the columns explicitly keeps the schema for an empty result, so
    # callers can still index result['length'] or call .describe().
    return pd.DataFrame(results, columns=columns)

# Generate 1000 sequences
# use_genes defaults to True, so this call goes through genomic_random_walk()
synthetic = generate_repertoire(graph, 1000)
print(synthetic.describe())

Evaluating Generated Sequences

Check Probability

# Draw one gene-annotated walk and turn it into a sequence
walk, v, j = graph.genomic_random_walk()
sequence = ''.join(map(AAPLZGraph.clean_node, walk))

# Re-encode the sequence and score it; use_log=True requests the log value
encoded = AAPLZGraph.encode_sequence(sequence)
pgen = graph.walk_probability(encoded, use_log=True)

print(f"Generated: {sequence}")
print(f"log P(gen): {pgen:.2f}")

Compare to Original Distribution

# Original sequence lengths
# NOTE(review): assumes graph.lengths maps sequence length -> count — confirm
original_lengths = pd.Series(graph.lengths).sort_index()

# Generated sequence lengths
# value_counts() orders by frequency, so re-sort by length before plotting
synthetic = generate_repertoire(graph, 1000)
generated_lengths = synthetic['length'].value_counts().sort_index()

# Align both series on one shared length index so that bars at the same
# x position refer to the same length; a length missing on one side counts 0
comparison = pd.DataFrame({
    'Original': original_lengths,
    'Generated': generated_lengths,
}).fillna(0)

# Plot comparison
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
comparison.plot(kind='bar', alpha=0.5, ax=ax)
ax.legend()
ax.set_xlabel('Sequence Length')
ax.set_ylabel('Count')
plt.tight_layout()
plt.savefig('length_comparison.png')

Use Cases

Data Augmentation

# Augment a small repertoire
small_data = pd.read_csv("small_repertoire.csv")  # 100 sequences
graph = AAPLZGraph(small_data)

# Generate more sequences
augmented = generate_repertoire(graph, 1000)

# Combine: rename both frames to the shared columns sequence / V / J
# ignore_index=True gives the stacked rows a fresh 0..n-1 index; without it
# the row labels of the two frames would repeat in the combined table
combined = pd.concat(
    [
        small_data[['cdr3_amino_acid', 'V', 'J']].rename(
            columns={'cdr3_amino_acid': 'sequence'}
        ),
        augmented[['sequence', 'v_gene', 'j_gene']].rename(
            columns={'v_gene': 'V', 'j_gene': 'J'}
        ),
    ],
    ignore_index=True,
)

Null Model Generation

def generate_null_repertoire(graph, n_sequences):
    """Build the null repertoire used for statistical testing.

    Delegates to ``generate_repertoire`` with its default settings and
    returns that result unchanged.
    """
    null_repertoire = generate_repertoire(graph, n_sequences)
    return null_repertoire

# Generate null distribution
null_seqs = generate_null_repertoire(graph, 10000)

# Test a specific sequence against null
test_seq = "CASSLEPSGGTDTQYF"
test_pgen = graph.walk_probability(
    AAPLZGraph.encode_sequence(test_seq),
    use_log=True
)

# Score every null sequence; sequences that cannot be scored are skipped
# but counted, so the loss is visible instead of silent
null_pgens = []
n_failed = 0
for seq in null_seqs['sequence']:
    try:
        pgen = graph.walk_probability(
            AAPLZGraph.encode_sequence(seq),
            use_log=True
        )
    # NOTE(review): narrow this to the exception walk_probability actually
    # raises; `Exception` at least lets KeyboardInterrupt/SystemExit through
    except Exception:
        n_failed += 1
        continue
    null_pgens.append(pgen)

if n_failed:
    print(f"Skipped {n_failed} null sequences that could not be scored")

# Calculate p-value: fraction of null scores at or below the test score
if null_pgens:
    p_value = sum(1 for p in null_pgens if p <= test_pgen) / len(null_pgens)
    print(f"P-value: {p_value:.4f}")
else:
    # Avoid dividing by zero when nothing could be scored
    print("No null sequences could be scored - p-value is undefined")

Troubleshooting

"No gene data" Error

try:
    walk, v, j = graph.genomic_random_walk()
except Exception as e:
    print("Gene data not available. Using random_walk instead.")
    # Show the underlying error so an unrelated failure is not mistaken
    # for missing gene data
    print(f"Reason: {e}")
    walk = graph.random_walk()
    # Keep v and j defined on the fallback path as well
    v, j = None, None

Empty Walks

walk = graph.random_walk()
# A falsy (empty) walk takes the else branch
if walk:
    print(f"Walk length: {len(walk)}")
else:
    print("Empty walk generated - check graph structure")

Next Steps