Draft training pipeline and bug fixes

Calcifer26 · Jul 29, 2024 · 96e9042 · 96e9042
1 parent 6e01552
commit 96e9042
Show file tree

Hide file tree

Showing 26 changed files with 907 additions and 400 deletions.
diff --git a/configs/data/erebor.yaml b/configs/data/erebor.yaml
@@ -8,53 +8,34 @@ train_alignment_dir: "alignment_data/alignments"
 train_chain_data_cache_path: "pdb_data/chain_data_cache.json"
 template_release_dates_cache_path: "pdb_data/mmcif_cache.json"
 train_mmcif_data_cache_path: "pdb_data/mmcif_cache.json"
-alignment_index_path: "alignment_data/alignment_dbs/alignment_db.index"
+alignment_index_path: null # "alignment_data/alignment_dbs/alignment_db.index"
 obsolete_pdbs_file_path: "pdb_data/obsolete.dat"
 train_filter_path: null
 
 # Distillation data paths
-distillation_data_dir: None
-distillation_alignment_dir: None
-distillation_chain_data_cache_path: None
-distillation_filter_path: None
-distillation_alignment_index_path: None
+distillation_data_dir: null
+distillation_alignment_dir: null
+distillation_chain_data_cache_path: null
+distillation_filter_path: null
+distillation_alignment_index_path: null
 
 # Validation data paths
-val_data_dir: None
-val_alignment_dir: None
-predict_data_dir: None
-predict_alignment_dir: None
+val_data_dir: null
+val_alignment_dir: null
+predict_data_dir: null
+predict_alignment_dir: null
 kalign_binary_path: '/usr/bin/kalign'
 
+# Placeholders
+placeholders:
+  NUM_RES: "num residues placeholder"
+  NUM_MSA_SEQ: "msa placeholder"
+  NUM_TEMPLATES: "num templates placeholder"
 
 config:
-  common:
-    feat: backbone
-    block_delete_msa:
-      msa_fraction_per_block: 0.3
-      randomize_num_blocks: false
-      num_blocks: 5
-    masked_msa:
-      profile_prob: 0.1
-      same_prob: 0.1
-      uniform_prob: 0.1
-    max_recycling_iters: 3
-    msa_cluster_features: true
-    reduce_msa_clusters_by_max_templates: false
-    resample_msa_in_recycling: true
-    unsupervised_features:
-      - "aatype"
-      - "residue_index"
-      - "msa"
-      - "num_alignments"
-      - "seq_length"
-      - "between_segment_residues"
-      - "deletion_matrix"
-      - "no_recycling_iters"
-    use_templates: false
-    use_template_torsion_angles: false
   seqemb_mode:
     enabled: false
+
   supervised:
     clamp_prob: 0.9
     supervised_features:
@@ -63,6 +44,7 @@ config:
       - "resolution"
       - "use_clamped_fape"
       - "is_distillation"
+
   predict:
     fixed_size: true  # plain boolean, matching the eval/train sections
     subsample_templates: false  # We want top templates.
@@ -78,6 +60,7 @@ config:
     interface_threshold: null  # YAML null; a bare `None` would load as the string "None"
     supervised: false
     uniform_recycling: false
+
   eval:
     fixed_size: true
     subsample_templates: false  # We want top templates.
@@ -93,6 +76,7 @@ config:
     interface_threshold: null  # YAML null; a bare `None` would load as the string "None"
     supervised: true
     uniform_recycling: false
+
   train:
     fixed_size: true
     subsample_templates: true
@@ -112,9 +96,129 @@ config:
     max_distillation_msa_clusters: 1000
     uniform_recycling: true
     distillation_prob: 0.75
+
   data_module:
     use_small_bfd: false
     data_loaders:
         batch_size: 1
         num_workers: 16
         pin_memory: true
+
+  common:
+    feat: # Features for AlphaFold 3, single chain, backbone only coordinates
+      aatype:  # [NUM_RES]
+        - ${placeholders.NUM_RES}
+      all_atom_mask:  # [NUM_RES, 4]
+        - ${placeholders.NUM_RES}
+        - null
+      all_atom_positions: # [NUM_RES, 4, 3]
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      ref_pos:  # [NUM_RES, 4, 3]
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      ref_mask:  # [NUM_RES, 4]
+        - ${placeholders.NUM_RES}
+        - null
+      ref_element: # [NUM_RES, 4, 4]
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      ref_charge:  # [NUM_RES, 4]
+        - ${placeholders.NUM_RES}
+        - null
+      ref_atom_name_chars:  # [NUM_RES, 4, 4]
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      ref_space_uid:  # [NUM_RES, 4]
+        - ${placeholders.NUM_RES}
+        - null
+      atom_to_token:  # [NUM_RES, 4]
+        - ${placeholders.NUM_RES}
+        - null
+      is_distillation: [ ]
+      msa_feat:  # [NUM_MSA_SEQ, NUM_RES, 49]
+        - ${placeholders.NUM_MSA_SEQ}
+        - ${placeholders.NUM_RES}
+        - null
+      msa_mask:  # [NUM_MSA_SEQ, NUM_RES]
+        - ${placeholders.NUM_MSA_SEQ}
+        - ${placeholders.NUM_RES}
+      residue_index:  # [NUM_RES]
+        - ${placeholders.NUM_RES}
+      residx_atom14_to_atom37:
+        - ${placeholders.NUM_RES}
+        - null
+      residx_atom37_to_atom14:
+        - ${placeholders.NUM_RES}
+        - null
+      resolution: [ ]
+      seq_length: [ ]
+      seq_mask:  # [NUM_RES]
+        - ${placeholders.NUM_RES}
+      template_aatype:  # [NUM_TEMPLATES, NUM_RES]
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+      template_all_atom_mask:  # [NUM_TEMPLATES, NUM_RES, 4]
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+        - null
+      template_all_atom_positions:
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      template_backbone_rigid_mask:
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+      template_backbone_rigid_tensor:
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+        - null
+        - null
+      template_mask:
+        - ${placeholders.NUM_TEMPLATES}
+      template_pseudo_beta:
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+        - null
+      template_pseudo_beta_mask:
+        - ${placeholders.NUM_TEMPLATES}
+        - ${placeholders.NUM_RES}
+      template_sum_probs:
+        - ${placeholders.NUM_TEMPLATES}
+        - null
+      valid_residues:
+        - ${placeholders.NUM_RES}
+      valid_templates:
+        - ${placeholders.NUM_TEMPLATES}
+      valid_msa:
+        - ${placeholders.NUM_MSA_SEQ}
+      valid_backbone_atoms:
+        - ${placeholders.NUM_RES}
+    block_delete_msa:
+      msa_fraction_per_block: 0.3
+      randomize_num_blocks: false
+      num_blocks: 5
+    masked_msa:
+      profile_prob: 0.1
+      same_prob: 0.1
+      uniform_prob: 0.1
+    max_recycling_iters: 3
+    msa_cluster_features: true
+    reduce_msa_clusters_by_max_templates: false
+    resample_msa_in_recycling: true
+    unsupervised_features:
+      - "aatype"
+      - "residue_index"
+      - "msa"
+      - "num_alignments"
+      - "seq_length"
+      - "between_segment_residues"
+      - "deletion_matrix"
+      - "no_recycling_iters"
+    use_templates: false
+    use_template_torsion_angles: false
diff --git a/configs/data/protein.yaml b/configs/data/protein.yaml
@@ -1,20 +1,22 @@
-_target_: src.data.protein_datamodule.ProteinDataModule
+_target_: src.data.components.protein_datamodule.ProteinDataModule
 data_dir: "./data/"
 resolution_thr: 3.5  # Resolution threshold for PDB structures
 min_seq_id: 0.3  # Minimum sequence identity for MMSeq2 clustering
-crop_size: 384  # The number of residues to crop the proteins to.
+crop_size: 128  # The number of residues to crop the proteins to.
 max_length: 10_000  # Entries with total length of chains larger than max_length will be disregarded.
 use_fraction: 1.0  #  the fraction of the clusters to use (first N in alphabetic order)
 entry_type: "chain"  # { "biounit", "chain", "pair" } the type of entries to generate
-classes_to_exclude: ['homomer', 'heteromer']  # a list of classes to exclude from the dataset
-mask_residues: False  # if True, the masked residues will be added to the output
+classes_to_exclude: null  # a list of classes to exclude from the dataset
+#   - 'homomer'
+#   - 'heteromer'
+mask_residues: false  # if True, the masked residues will be added to the output
 lower_limit: 15  # the lower limit of the number of residues to mask
 upper_limit: 100  # the upper limit of the number of residues to mask
 mask_frac: null  # if given, the number of residues to mask is mask_frac times the length of the chain
 mask_sequential: False  # if True, the masked residues will be neighbors in the sequence; otherwise geometric mask
 mask_whole_chains: False  # if True, the whole chain is masked
 force_binding_sites_frac: 0.15
-batch_size: 12  # The batch size. Defaults to `64`.
+batch_size: 4  # The batch size. Defaults to `64`.
 num_workers: 7  # The number of workers. Defaults to `0`.
 pin_memory: True  # Whether to pin memory.
-debug: True
+debug: False
diff --git a/configs/loss/defaults.yaml b/configs/loss/defaults.yaml
diff --git a/configs/loss/fine_tuning.yaml b/configs/loss/fine_tuning.yaml
diff --git a/configs/loss/initial_training.yaml b/configs/loss/initial_training.yaml