Skip to content

Commit

Permalink
Draft training pipeline and bug fixes
Browse files Browse the repository at this point in the history
  • Loading branch information
ardagoreci committed Jul 29, 2024
1 parent 6e01552 commit 96e9042
Show file tree
Hide file tree
Showing 26 changed files with 907 additions and 400 deletions.
174 changes: 139 additions & 35 deletions configs/data/erebor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,53 +8,34 @@ train_alignment_dir: "alignment_data/alignments"
train_chain_data_cache_path: "pdb_data/chain_data_cache.json"
template_release_dates_cache_path: "pdb_data/mmcif_cache.json"
train_mmcif_data_cache_path: "pdb_data/mmcif_cache.json"
# Only one `alignment_index_path` entry is kept: a mapping must not repeat a
# key, and loaders that tolerate it keep the last value (null) anyway.
alignment_index_path: null # "alignment_data/alignment_dbs/alignment_db.index"
obsolete_pdbs_file_path: "pdb_data/obsolete.dat"
train_filter_path: null

# Distillation data paths
# All unset. Use YAML `null` here: a bare `None` is not a YAML null and loads
# as the string "None".
distillation_data_dir: null
distillation_alignment_dir: null
distillation_chain_data_cache_path: null
distillation_filter_path: null
distillation_alignment_index_path: null

# Validation and prediction data paths
# Unset entries are YAML `null` (a bare `None` would load as the string "None").
val_data_dir: null
val_alignment_dir: null
predict_data_dir: null
predict_alignment_dir: null
kalign_binary_path: '/usr/bin/kalign'

# Placeholders
# Named strings for the residue, MSA-sequence and template dimensions. The
# `feat` shape lists reference them as `${placeholders.<NAME>}`, so the three
# entries must be nested under `placeholders:` for those lookups to resolve.
placeholders:
  NUM_RES: "num residues placeholder"
  NUM_MSA_SEQ: "msa placeholder"
  NUM_TEMPLATES: "num templates placeholder"

# NOTE(review): nesting is not visible here (leading indentation is missing),
# so which keys below belong to which parent must be confirmed.
config:
# NOTE(review): a second `common:` section appears further down, with a
# per-feature shape schema under `feat:` instead of the scalar `backbone`.
# Duplicate keys are invalid YAML and most loaders keep only the last one —
# confirm which section is intended. `seqemb_mode` exists only in this one.
common:
feat: backbone
# Presumably the MSA block-deletion augmentation settings — confirm.
block_delete_msa:
msa_fraction_per_block: 0.3
randomize_num_blocks: false
num_blocks: 5
# Three probabilities, each 0.1 (presumably the masked-MSA replacement
# choices — confirm against the consumer).
masked_msa:
profile_prob: 0.1
same_prob: 0.1
uniform_prob: 0.1
max_recycling_iters: 3
msa_cluster_features: true
reduce_msa_clusters_by_max_templates: false
resample_msa_in_recycling: true
# Feature names listed as unsupervised.
unsupervised_features:
- "aatype"
- "residue_index"
- "msa"
- "num_alignments"
- "seq_length"
- "between_segment_residues"
- "deletion_matrix"
- "no_recycling_iters"
use_templates: false
use_template_torsion_angles: false
seqemb_mode:
enabled: false

supervised:
clamp_prob: 0.9
supervised_features:
Expand All @@ -63,6 +44,7 @@ config:
- "resolution"
- "use_clamped_fape"
- "is_distillation"

predict:
# Plain boolean: a trailing comma after `true` would become part of the
# scalar and load as the string "true," instead of a bool.
fixed_size: true
subsample_templates: false # We want top templates.
Expand All @@ -78,6 +60,7 @@ config:
# Unset: YAML `null`, not the Python-style `None` (which loads as a string).
interface_threshold: null
supervised: false
uniform_recycling: false

eval:
fixed_size: true
subsample_templates: false # We want top templates.
Expand All @@ -93,6 +76,7 @@ config:
# Key spelled without the stray trailing quote so it matches the same key in
# the other modes; value is YAML `null` rather than the string "None".
interface_threshold: null
supervised: true
uniform_recycling: false

train:
fixed_size: true
subsample_templates: true
Expand All @@ -112,9 +96,129 @@ config:
max_distillation_msa_clusters: 1000
uniform_recycling: true
distillation_prob: 0.75

# Data-module settings.
data_module:
use_small_bfd: false
# Batch size, worker count and memory pinning (presumably forwarded to the
# data loader — confirm against the consumer).
data_loaders:
batch_size: 1
num_workers: 16
pin_memory: true

common:
feat: # Features for AlphaFold 3, single chain, backbone only coordinates
# Each feature maps to a list with one item per dimension: a
# `${placeholders.*}` reference for a named dimension, or `null` where no
# placeholder applies (presumably a fixed-size dimension — confirm).
# In shape comments, `?` marks a `null` entry whose size is not stated.
aatype: # [NUM_RES]
- ${placeholders.NUM_RES}
all_atom_mask: # [NUM_RES, 4]
- ${placeholders.NUM_RES}
- null
all_atom_positions: # [NUM_RES, 4, 3]
- ${placeholders.NUM_RES}
- null
- null
ref_pos: # [NUM_RES, 4, 3]
- ${placeholders.NUM_RES}
- null
- null
ref_mask: # [NUM_RES, 4]
- ${placeholders.NUM_RES}
- null
ref_element: # [NUM_RES, 4, 4]
- ${placeholders.NUM_RES}
- null
- null
ref_charge: # [NUM_RES, 4]
- ${placeholders.NUM_RES}
- null
ref_atom_name_chars: # [NUM_RES, 4, 4]
- ${placeholders.NUM_RES}
- null
- null
ref_space_uid: # [NUM_RES, 4]
- ${placeholders.NUM_RES}
- null
atom_to_token: # [NUM_RES, 4]
- ${placeholders.NUM_RES}
- null
is_distillation: [ ] # empty shape list (presumably a scalar — confirm)
msa_feat: # [NUM_MSA_SEQ, NUM_RES, 49]
- ${placeholders.NUM_MSA_SEQ}
- ${placeholders.NUM_RES}
- null
msa_mask: # [NUM_MSA_SEQ, NUM_RES]
- ${placeholders.NUM_MSA_SEQ}
- ${placeholders.NUM_RES}
residue_index: # [NUM_RES]
- ${placeholders.NUM_RES}
residx_atom14_to_atom37: # [NUM_RES, ?]
- ${placeholders.NUM_RES}
- null
residx_atom37_to_atom14: # [NUM_RES, ?]
- ${placeholders.NUM_RES}
- null
resolution: [ ] # empty shape list (presumably a scalar — confirm)
seq_length: [ ] # empty shape list (presumably a scalar — confirm)
seq_mask: # [NUM_RES]
- ${placeholders.NUM_RES}
# No trailing commas on the items: in a block sequence a comma is part of the
# scalar, so the entry would resolve to "<placeholder>," and no longer equal
# the placeholder string that the other shape entries resolve to.
template_aatype: # [NUM_TEMPLATES, NUM_RES]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
template_all_atom_mask: # [NUM_TEMPLATES, NUM_RES, 4]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
- null
# Shape comments below mirror the list items; `?` marks a `null` entry whose
# size is not stated here.
template_all_atom_positions: # [NUM_TEMPLATES, NUM_RES, ?, ?]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
- null
- null
template_backbone_rigid_mask: # [NUM_TEMPLATES, NUM_RES]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
template_backbone_rigid_tensor: # [NUM_TEMPLATES, NUM_RES, ?, ?]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
- null
- null
template_mask: # [NUM_TEMPLATES]
- ${placeholders.NUM_TEMPLATES}
template_pseudo_beta: # [NUM_TEMPLATES, NUM_RES, ?]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
- null
template_pseudo_beta_mask: # [NUM_TEMPLATES, NUM_RES]
- ${placeholders.NUM_TEMPLATES}
- ${placeholders.NUM_RES}
template_sum_probs: # [NUM_TEMPLATES, ?]
- ${placeholders.NUM_TEMPLATES}
- null
valid_residues: # [NUM_RES]
- ${placeholders.NUM_RES}
valid_templates: # [NUM_TEMPLATES]
- ${placeholders.NUM_TEMPLATES}
valid_msa: # [NUM_MSA_SEQ]
- ${placeholders.NUM_MSA_SEQ}
valid_backbone_atoms: # [NUM_RES]
- ${placeholders.NUM_RES}
# Presumably the MSA block-deletion augmentation settings — confirm.
block_delete_msa:
msa_fraction_per_block: 0.3
randomize_num_blocks: false
num_blocks: 5
# Three probabilities, each 0.1 (presumably the masked-MSA replacement
# choices — confirm against the consumer).
masked_msa:
profile_prob: 0.1
same_prob: 0.1
uniform_prob: 0.1
max_recycling_iters: 3
msa_cluster_features: true
reduce_msa_clusters_by_max_templates: false
resample_msa_in_recycling: true
# Feature names listed as unsupervised.
unsupervised_features:
- "aatype"
- "residue_index"
- "msa"
- "num_alignments"
- "seq_length"
- "between_segment_residues"
- "deletion_matrix"
- "no_recycling_iters"
# Templates are switched off in this config.
use_templates: false
use_template_torsion_angles: false
14 changes: 8 additions & 6 deletions configs/data/protein.yaml
Original file line number Diff line number Diff line change
@@ -1,20 +1,22 @@
# Settings for ProteinDataModule. Each key appears exactly once (duplicate keys
# are invalid YAML and are silently resolved last-wins by most loaders).
_target_: src.data.components.protein_datamodule.ProteinDataModule
data_dir: "./data/"
resolution_thr: 3.5 # Resolution threshold for PDB structures
min_seq_id: 0.3 # Minimum sequence identity for MMseqs2 clustering
crop_size: 128 # The number of residues to crop the proteins to.
# Written without a digit separator: `10_000` is an int only under YAML 1.1
# loaders; YAML 1.2 loaders read it as a string.
max_length: 10000 # Entries with total length of chains larger than max_length will be disregarded.
use_fraction: 1.0 # the fraction of the clusters to use (first N in alphabetic order)
entry_type: "chain" # { "biounit", "chain", "pair" } the type of entries to generate
classes_to_exclude: null # a list of classes to exclude from the dataset
# - 'homomer'
# - 'heteromer'
mask_residues: false # if True, the masked residues will be added to the output
lower_limit: 15 # the lower limit of the number of residues to mask
upper_limit: 100 # the upper limit of the number of residues to mask
# `null` means "not given"; a bare `None` would load as the string "None".
mask_frac: null # if given, the number of residues to mask is mask_frac times the length of the chain
mask_sequential: false # if True, the masked residues will be neighbors in the sequence; otherwise geometric mask
mask_whole_chains: false # if True, the whole chain is masked
force_binding_sites_frac: 0.15
batch_size: 4 # The batch size. Defaults to `64`.
num_workers: 7 # The number of workers. Defaults to `0`.
pin_memory: true # Whether to pin memory.
debug: false
38 changes: 0 additions & 38 deletions configs/loss/defaults.yaml

This file was deleted.

3 changes: 0 additions & 3 deletions configs/loss/fine_tuning.yaml

This file was deleted.

2 changes: 0 additions & 2 deletions configs/loss/initial_training.yaml

This file was deleted.

Loading

0 comments on commit 96e9042

Please sign in to comment.