Data Structures
AIDE provides several key data structures for working with protein sequences and structures. Models will receive these data structures as input in analog to how sklearn receives numpy arrays.
ProteinSequence
ProteinSequence
is the basic unit for representing a protein sequence. It behaves like a string but provides additional functionality specific to protein sequences.
from aide_predict import ProteinSequence
# Create a basic sequence
seq = ProteinSequence("MKLLVLGLPGAGKGT")
# or get from a pdb (see ProteinStructure below)
seq = ProteinSequence.from_pdb("path/to/structure.pdb")
# Add identifier and optional structure
seq = ProteinSequence(
"MKLLVLGLPGAGKGT",
id="P1234",
structure="path/to/structure.pdb"
)
# Create a sequence with an associated MSA
seq = ProteinSequence(
"MKLLVLGLPGAGKGT",
id="P1234",
msa=msa_sequences # ProteinSequences object containing the MSA
)
# From a fasta file, assumed to be aligned. Uses first sequence in file, set's the MSA attribute
seq = ProteinSequence.from_fasta("path/to/aligned.fasta")
seq.has_msa
>>> True
Key attributes and methods:
id
: Optional identifier for the sequencestructure
: Optional associated structure (as ProteinStructure object)msa
: Optional associated multiple sequence alignment (as ProteinSequences object)has_gaps
: Boolean indicating if sequence contains gaps (‘-’ or ‘.’)base_length
: Length excluding gapshas_non_canonical
: Boolean indicating presence of non-standard amino acidsas_array
: Sequence as numpy arrayhas_msa
: Boolean indicating if the sequence has an associated MSAmsa_same_width
: Boolean indicating if the MSA has the same width as the sequenceis_in_msa
: Boolean indicating if the sequence is present in its MSA
Common operations:
# Sequence manipulation
ungapped = seq.with_no_gaps() # Remove gaps
upper_seq = seq.upper() # Convert to uppercase
# Mutation operations
mutated = seq.mutate("A123G") # Single mutation
mutated = seq.mutate(["A123G", "L45R"]) # Multiple mutations
# Compare sequences
positions = seq.mutated_positions(other_seq) # Get positions that differ
mutations = seq.get_mutations(other_seq) # Get mutation strings (e.g., "A123G")
# Create all possible mutations
library = seq.saturation_mutagenesis() # Generate all single mutants
library = seq.saturation_mutagenesis(positions=[1,2,3]) # Specific positions
# align to another sequence
seq2 = seq2.align(seq)
# align to an existing alignment
seq2 = seq2.align(msa) # Align to existing MSA
ProteinSequences
ProteinSequences
manages collections of protein sequences, similar to a list but with additional functionality for protein-specific operations.
from aide_predict import ProteinSequences
# Create from various sources
sequences = ProteinSequences([seq1, seq2, seq3]) # From ProteinSequence objects
sequences = ProteinSequences.from_fasta("sequences.fasta") # From FASTA file
sequences = ProteinSequences.from_a3m("alignment.a3m") # From A3M file
sequences = ProteinSequences.from_list(["MKLL...", "MKLT..."]) # From strings
sequences = ProteinSequences.from_dict({"seq1": "MKLL...", "seq2": "MKLT..."})
sequences = ProteinSequences.from_dict(my_dataframe["sequence"].to_dict())
sequences = ProteinSequences.from_df(my_dataframe) # assumes first column is sequences
sequences = ProteinSequences.from_df(my_dataframe, sequence_col="seq_col", id_col="id_col") # specify columns
sequences, labels = ProteinSequences.from_df(my_dataframe, label_cols='label') # get a array of labels
Key attributes:
aligned
: Boolean indicating if all sequences have same length (including gaps)fixed_length
: Boolean indicating if all sequences have same base length (excluding gaps)width
: Length of sequences if aligned, None otherwisehas_gaps
: Boolean indicating if any sequence contains gapsmutated_positions
: List of positions with variation (for aligned sequences)weights
: Optional weights for each sequence (used in some models)ids
: List of sequence IDs
Common operations:
# Sequence alignment
aligned = sequences.align_all() # Align all sequences
aligned = sequences.align_to(reference_msa) # Align to existing MSA
# Access and manipulation
sequences[0] # Access by index
sequences["seq1"] # Access by ID
sequences.with_no_gaps() # Remove gaps from all sequences
sequences.upper() # Convert all to uppercase
# MSA operations
msa = sequences.msa_process(
focus_seq_id="wild_type",
theta=0.8 # Sequence reweighting parameter
)
# Sampling operations
sampled = sequences.sample(n=100, replace=False, keep_first=True) # Sample sequences, keeping the first one
# Save/export
sequences.to_fasta("output.fasta")
sequences_dict = sequences.to_dict()
# Batching for large datasets
for batch in sequences.iter_batches(batch_size=32):
# Process batch
pass
# Get mapping between aligned and original positions
if sequences.aligned:
alignment_mapping = sequences.get_alignment_mapping()
# Returns dict mapping sequence IDs to lists of positions
# Convert to on-file representation for large MSAs
on_file = sequences.to_on_file("output.fasta") # Save to file and return a file-based object
ProteinStructure
ProteinStructure
represents the 3D structure of a protein, integrating with common structure file formats and analysis tools.
from aide_predict import ProteinStructure
# Create from PDB file
structure = ProteinStructure(
pdb_file="protein.pdb",
chain="A", # Optional chain identifier
plddt_file="confidence.json" # Optional AlphaFold2 confidence scores
)
# Create from AlphaFold2 output folder
structure = ProteinStructure.from_af2_folder(
folder_path="af2_results",
chain="A"
)
Key methods:
# Access structure information
sequence = structure.get_sequence() # Get amino acid sequence
plddt = structure.get_plddt() # Get pLDDT confidence scores
dssp = structure.get_dssp() # Get secondary structure assignments
# Validation
structure.validate_sequence("MKLLVLGLPGAGKGT") # Check if sequence matches structure
# Access underlying structure objects
structure_obj = structure.get_structure() # Get BioPython Structure object
chain_obj = structure.get_chain() # Get specific chain
positions = structure.get_residue_positions() # Get residue numbers
StructureMapper
StructureMapper
helps manage multiple structures and map them to sequences, particularly useful when working with structure-aware models.
from aide_predict import StructureMapper
# Initialize with folder containing structures
mapper = StructureMapper("path/to/structures")
# Structure can be PDB files or AlphaFold2 prediction folders
# Example folder structure:
# structures/
# ├── protein1.pdb
# ├── protein2.pdb
# └── protein3/ # AlphaFold2 output folder
# ├── ranked_0.pdb
# └── ranking_confidence.json
# Assign structures to ProteinSequences already loaded
sequences = mapper.assign_structures(sequences)
# Get available structures
available_ids = mapper.get_available_structures()
# Get ProteinSequences with structures
sequences = mapper.get_protein_sequences()
The StructureMapper is particularly useful when working with structure-aware models like SaProt, which can use structure information to improve predictions:
# Example workflow with structure-aware model
mapper = StructureMapper("structures/")
sequences = ProteinSequences.from_fasta("sequences.fasta")
sequences = mapper.assign_structures(sequences)
model = SaProtLikelihoodWrapper(wt=sequences["wild_type"])
predictions = model.predict(sequences) # Will use structures where available, falling back to the WT structure
ProteinSequencesOnFile
ProteinSequencesOnFile
provides a memory-efficient way to work with large alignments by only loading sequences when needed.
from aide_predict import ProteinSequencesOnFile
# Create from FASTA file
on_file_sequences = ProteinSequencesOnFile("large_alignment.fasta")
# Access properties efficiently without loading everything
print(on_file_sequences.aligned) # Check if aligned
print(on_file_sequences.width) # Get width if aligned
print(len(on_file_sequences)) # Get count of sequences
# Load a specific sequence
seq = on_file_sequences[0] # Get by index
seq = on_file_sequences["seq1"] # Get by ID
# Iterate through sequences without loading all at once
for seq in on_file_sequences:
process_sequence(seq)
# Load into memory if needed
in_memory = on_file_sequences.to_memory()
ProteinTrajectory
NOT YET IMPLEMENTED