Skip to content

Commit e85bee5

Browse files
committed
working on foldcomp conversion
1 parent a6d6856 commit e85bee5

8 files changed

Lines changed: 601 additions & 376 deletions

File tree

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,66 @@
1+
# Representation Conversion Guide (PDB ⇄ RT ⇄ Quaternion)
2+
3+
This guide documents the conversion flow used by
4+
`foldtree2/scripts/test_representation_conversions.py`.
5+
6+
## Overview
7+
8+
The script follows this sequence:
9+
10+
1. **PDB backbone extraction**
11+
- Extract per-residue backbone atom coordinates: `N`, `CA`, `C`.
12+
2. **Backbone coordinates → local frames**
13+
- Use `PDB2PyG.compute_local_frame(coords)` with `coords` of shape `(N, 3, 3)` in order `[N, CA, C]`.
14+
- Output:
15+
- `R`: rotation matrices, shape `(N, 3, 3)`
16+
- `t`: translation vectors, shape `(N, 3)`
17+
3. **Rotation matrices → quaternions**
18+
- Use `rotation_matrix_to_quaternion(R)`.
19+
- Quaternion convention in this repo: `(w, x, y, z)` (scalar first).
20+
4. **Quaternions → rotation matrices**
21+
- Use `quaternion_to_rotation_matrix(q)` for roundtrip reconstruction.
22+
5. **RT → chain coordinates**
23+
- Use `reconstruct_positions(R, t)` to reconstruct coordinates from transforms.
24+
25+
## Noise Experiments
26+
27+
The script evaluates robustness by injecting noise in each representation:
28+
29+
- **Coordinate noise**: add Gaussian noise to `(N, CA, C)` coordinates, then recompute `R, t, q`.
30+
- **RT noise**:
31+
- left-multiply random small rotations onto `R`
32+
- add Gaussian noise to `t`
33+
- **Quaternion noise**:
34+
- add Gaussian noise to quaternion components
35+
- renormalize quaternions to unit norm
36+
- convert back to rotation matrices
37+
38+
## Losses
39+
40+
For each noisy variant, the script reports:
41+
42+
- **FAPE loss** via `fape_loss(true_R, true_t, pred_R, pred_t, batch)`
43+
- **lDDT-style loss** via `compute_lddt_loss(true_positions, pred_positions)`,
  where positions come from `reconstruct_positions`.
45+
46+
Lower values indicate better consistency with the baseline representation.
47+
48+
## Run
49+
50+
From the repository root:
51+
52+
```bash
53+
python -m foldtree2.scripts.test_representation_conversions \
54+
--pdb-path foldtree2/config/1eei.pdb \
55+
--coord-noise 0.25 \
56+
--rot-noise-rad 0.05 \
57+
--trans-noise 0.10 \
58+
--quat-noise 0.05 \
59+
--seed 0
60+
```
61+
62+
## Key Notes
63+
64+
- Use backbone triplets `[N, CA, C]` to define local frames.
65+
- CA-only coordinates are not sufficient for unique residue local orientation without extra assumptions.
66+
- Keep quaternion convention consistent as `(w, x, y, z)` throughout conversions.

foldtree2/foldcomp2fasta.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
import argparse
2+
import os
3+
4+
import torch
5+
6+
7+
def _read_ids(ids_file):
8+
ids = []
9+
with open(ids_file, 'r', encoding='utf-8') as f:
10+
for line in f:
11+
line = line.strip()
12+
if not line or line.startswith('#'):
13+
continue
14+
ids.append(line.split()[0])
15+
return ids
16+
17+
18+
def main():
    """CLI entry point: encode a Foldcomp DB into a FoldTree2 token FASTA.

    Parses command-line arguments, loads the trained encoder checkpoint onto
    the requested (or auto-detected) device, and delegates the actual
    encoding to ``encoder.encode_foldcomp_fasta``.

    Raises:
        FileNotFoundError: If the model checkpoint path does not exist.
    """
    ap = argparse.ArgumentParser(
        description='Encode a Foldcomp DB directly to FoldTree2 token FASTA.'
    )
    ap.add_argument('model', type=str, help='Path to trained encoder .pt file')
    ap.add_argument('foldcomp_db', type=str, help='Path to Foldcomp DB basename (without .lookup)')
    ap.add_argument('output_fasta', type=str, help='Output encoded FASTA path')

    ap.add_argument('--device', type=str, default=None, help='Device (e.g., cuda, cuda:0, cpu)')
    ap.add_argument('--ids-file', type=str, default=None, help='Optional text file with Foldcomp IDs (one per line)')
    ap.add_argument('--max-structures', type=int, default=None, help='Optional max number of structures to encode')
    ap.add_argument('--chunk-size', type=int, default=1024, help='Foldcomp prefetch chunk size (default: 1024)')
    ap.add_argument('--queue-size', type=int, default=4, help='Producer/consumer queue size (default: 4)')
    ap.add_argument('--batch-size', type=int, default=16, help='Encoder batch size per chunk (default: 16)')
    ap.add_argument('--cache-size', type=int, default=0, help='Graph cache size in Foldcomp dataset (default: 0)')
    ap.add_argument('--no-replace', action='store_true', help='Disable FASTA special-character replacement')
    ap.add_argument('--quiet', action='store_true', help='Disable progress bar')

    cli = ap.parse_args()

    if not os.path.exists(cli.model):
        raise FileNotFoundError(f'Model not found: {cli.model}')

    # Optional explicit ID subset; None means "encode the whole DB".
    id_list = _read_ids(cli.ids_file) if cli.ids_file is not None else None

    # Honour an explicit --device, otherwise prefer CUDA when available.
    if cli.device:
        run_device = torch.device(cli.device)
    else:
        run_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): weights_only=False unpickles arbitrary objects — only load
    # checkpoints from trusted sources.
    encoder = torch.load(cli.model, map_location=run_device, weights_only=False)
    encoder = encoder.to(run_device)
    encoder.device = run_device
    encoder.eval()

    output = encoder.encode_foldcomp_fasta(
        foldcomp_db=cli.foldcomp_db,
        filename=cli.output_fasta,
        ids=id_list,
        max_structures=cli.max_structures,
        chunk_size=cli.chunk_size,
        queue_size=cli.queue_size,
        batch_size=cli.batch_size,
        cache_size=cli.cache_size,
        replace=not cli.no_replace,
        verbose=not cli.quiet,
    )

    print(f'Encoded FASTA written to: {output}')
65+
66+
67+
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == '__main__':
    main()

foldtree2/foldcomp_otf.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""Worked example of the foldcomp Python API.

Part 01 decompresses a single ``.fcz`` file and writes it back out as PDB;
part 02 iterates selected entries of a Foldcomp database. The hard-coded
``test/...`` paths assume this is run from a directory containing foldcomp's
test fixtures — NOTE(review): presumably mirrors the foldcomp README example;
confirm the paths before running.
"""
import foldcomp
# 01. Handling a FCZ file
# Open a fcz file
with open("test/compressed.fcz", "rb") as fcz:
    fcz_binary = fcz.read()

# Decompress
(name, pdb) = foldcomp.decompress(fcz_binary)  # pdb_out[0]: file name, pdb_out[1]: pdb binary string

# Save to a pdb file
with open(name, "w") as pdb_file:
    pdb_file.write(pdb)

# Get data as dictionary
data_dict = foldcomp.get_data(fcz_binary)  # foldcomp.get_data(pdb) also works
# Keys: phi, psi, omega, torsion_angles, residues, bond_angles, coordinates
data_dict["phi"]  # phi angles (C-N-CA-C)
data_dict["psi"]  # psi angles (N-CA-C-N)
data_dict["omega"]  # omega angles (CA-C-N-CA)
data_dict["torsion_angles"]  # torsion angles of the backbone as list (phi + psi + omega)
data_dict["bond_angles"]  # bond angles of the backbone as list
data_dict["residues"]  # amino acid residues as string
data_dict["coordinates"]  # coordinates of the backbone as list

# 02. Iterate over a database of FCZ files
# Open a foldcomp database
ids = ["d1asha_", "d1it2a_"]
with foldcomp.open("test/example_db", ids=ids) as db:
    # Iterate through database
    for (name, pdb) in db:
        # save entries as separate pdb files
        with open(name + ".pdb", "w") as pdb_file:
            pdb_file.write(pdb)

foldtree2/learn_lightning.py

Lines changed: 24 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -537,11 +537,11 @@ def training_step(self, batch, batch_idx):
537537

538538
# lDDT loss
539539
lddt_loss = torch.tensor(0.0, device=self.device)
540-
if (self.args.lddt_weight > 0 or getattr(self.args, 'lddt_loss', False)) and out.get('coords') is not None and hasattr(data, 'coords') and hasattr(data['coords'], 'x'):
540+
if (self.args.lddt_weight > 0 or getattr(self.args, 'lddt_loss', False)) and out.get('quat_pred') is not None and out.get('trans_pred') is not None and hasattr(data, 'coords') and hasattr(data['coords'], 'x'):
541541
from foldtree2.src.losses.losses import batch_lddt_loss
542542
lddt_loss = batch_lddt_loss(
543-
pred_q=out.get('quat', None),
544-
pred_t=out.get('trans', None),
543+
pred_q=out.get('quat_pred'),
544+
pred_t=out.get('trans_pred'),
545545
true_coords=data['coords'].x,
546546
batch=getattr(data['res'], 'batch', None),
547547
plddt=data['plddt'].x if self.args.mask_plddt else None,
@@ -550,26 +550,37 @@ def training_step(self, batch, batch_idx):
550550

551551
# FAPE loss
552552
fape_loss = torch.tensor(0.0, device=self.device)
553-
if (self.args.fape_weight > 0 or getattr(self.args, 'fape_loss', False)) and out.get('quat') is not None and out.get('trans') is not None and hasattr(data, 'quat') and hasattr(data['quat'], 'x') and hasattr(data, 'trans') and hasattr(data['trans'], 'x'):
553+
if (self.args.fape_weight > 0 or getattr(self.args, 'fape_loss', False)) and out.get('quat_pred') is not None and out.get('trans_pred') is not None and hasattr(data, 'q_true') and hasattr(data['q_true'], 'x') and hasattr(data, 'coords') and hasattr(data['coords'], 'x'):
554554
from foldtree2.src.losses.losses import batch_fape_loss
555+
_fape_batch = getattr(data['res'], 'batch', None)
556+
_pred_disp = out['trans_pred']
557+
# Convert CA-to-CA displacements to CA positions (cumsum per structure)
558+
# FAPE requires absolute positions; cumsum from origin is translation-invariant
559+
if _fape_batch is not None:
560+
_pred_pos = torch.zeros_like(_pred_disp)
561+
for _b in torch.unique(_fape_batch):
562+
_m = (_fape_batch == _b).nonzero(as_tuple=True)[0]
563+
_pred_pos[_m] = torch.cumsum(_pred_disp[_m], dim=0)
564+
else:
565+
_pred_pos = torch.cumsum(_pred_disp, dim=0)
555566
fape_loss = batch_fape_loss(
556-
true_q=data['quat'].x,
557-
true_t=data['trans'].x,
558-
pred_q=out['quat'],
559-
pred_t=out['trans'],
560-
batch=getattr(data['res'], 'batch', None),
567+
true_q=data['q_true'].x,
568+
true_t=data['coords'].x,
569+
pred_q=out['quat_pred'],
570+
pred_t=_pred_pos,
571+
batch=_fape_batch,
561572
)
562573

563574
# Delta loss
564575
delta_loss_val = torch.tensor(0.0, device=self.device)
565-
if (self.args.delta_weight > 0 or getattr(self.args, 'delta_loss', False)) and out.get('coords') is not None and hasattr(data, 'coords') and hasattr(data['coords'], 'x'):
576+
if (self.args.delta_weight > 0 or getattr(self.args, 'delta_loss', False)) and (out.get('quat_pred') is not None or out.get('coords') is not None) and hasattr(data, 'coords') and hasattr(data['coords'], 'x'):
566577
from foldtree2.src.losses.losses import batch_delta_loss
567578
try:
568-
if out.get('quat') is not None and out.get('trans') is not None:
579+
if out.get('quat_pred') is not None and out.get('trans_pred') is not None:
569580
delta_loss_val = batch_delta_loss(
570581
true_ca=data['coords'].x,
571-
pred_q=out['quat'],
572-
pred_t=out['trans'],
582+
pred_q=out['quat_pred'],
583+
pred_t=out['trans_pred'],
573584
batch=getattr(data['res'], 'batch', None),
574585
plddt=data['plddt'].x if self.args.mask_plddt else None,
575586
plddt_thresh=self.args.plddt_threshold if self.args.mask_plddt else 0.0,

0 commit comments

Comments
 (0)