---
# Configuration for Multi-GPU training with learn_lightning.py
# This config matches the learn_monodecoder.py logic but enables multi-GPU distributed training
# Usage: python foldtree2/learn_lightning.py --config config_multi_gpu_training.yaml

# Dataset configuration
dataset: "structs_train_final.h5"
output_dir: "./models/multi_gpu_training"
run_name: "multi_gpu_muon_training"

# Multi-GPU settings
gpus: 4  # Number of GPUs to use (set to number available on your system)
strategy: "ddp"  # Distributed Data Parallel - recommended for multi-GPU
# Options: 'auto', 'ddp', 'ddp_spawn', 'dp'
#   - ddp: Distributed Data Parallel (fastest, recommended)
#   - ddp_spawn: DDP with spawn (slower but more stable)
#   - dp: DataParallel (legacy, not recommended)

# Training hyperparameters
epochs: 1000
batch_size: 15  # Per-GPU batch size (effective batch = batch_size * gpus)
gradient_accumulation_steps: 1
seed: 0

# Model architecture
hidden_size: 100
num_embeddings: 40
embedding_dim: 128

# Encoder configuration
use_muon_encoder: false  # Standard mk1_Encoder
EMA: true

# Decoder configuration
use_muon_decoders: false  # Standard decoders
hetero_gae: false  # Use MultiMonoDecoder

# Optimizer settings (Muon optimizer)
use_muon: true
muon_lr: 0.02  # Muon learning rate for hidden weights
adamw_lr: 0.0001  # AdamW learning rate (1e-4)

# Mixed precision training (essential for multi-GPU efficiency)
mixed_precision: true

# pLDDT masking
mask_plddt: true
plddt_threshold: 0.3

# Learning rate scheduling
lr_schedule: "plateau"
lr_warmup_steps: 20
lr_warmup_ratio: 0.05  # Overrides lr_warmup_steps
# Gradient settings
clip_grad: true

# Commitment cost scheduling
use_commitment_scheduling: true
commitment_cost: 0.9
commitment_schedule: "cosine_with_restart"
commitment_warmup_steps: 1000
commitment_start: 0.5

# TensorBoard logging
tensorboard_dir: "./runs/"

# Additional settings
se3_transformer: false
output_fft: false
output_rt: false

# Notes for Multi-GPU Training:
#
# 1. EFFECTIVE BATCH SIZE:
#    - Total batch size = batch_size * gpus * gradient_accumulation_steps
#    - Example: 15 * 4 * 1 = 60 samples per optimization step
#
# 2. LEARNING RATE SCALING:
#    - When using multiple GPUs, you may want to scale the learning rate
#    - Common practice: lr_new = lr_base * sqrt(num_gpus) or lr_new = lr_base * num_gpus
#    - Current config uses base rates from notebook
#
# 3. DDP vs DP:
#    - DDP (Distributed Data Parallel): Recommended, most efficient
#      * Each GPU has its own process
#      * Gradients synchronized via all-reduce
#      * Supports multi-node training
#    - DP (DataParallel): Legacy, single-process
#      * Uses threading, has GIL overhead
#      * Only single-node
#
# 4. SYNCHRONIZATION:
#    - DDP synchronizes gradients automatically after each backward pass
#    - All GPUs see the same model parameters
#    - Batch norm statistics are synced across GPUs
#
# 5. MONITORING:
#    - TensorBoard logs from rank 0 (main process) only
#    - Each GPU processes different data batches
#    - Loss reported is averaged across all GPUs
#
# 6. RUNNING THE SCRIPT:
#    Single GPU (for testing):
#      python foldtree2/learn_lightning.py --config config_multi_gpu_training.yaml --gpus 1
#
#    Multi-GPU (4 GPUs):
#      python foldtree2/learn_lightning.py --config config_multi_gpu_training.yaml --gpus 4
#
#    Specific GPUs (e.g., GPU 0 and 1):
#      CUDA_VISIBLE_DEVICES=0,1 python foldtree2/learn_lightning.py --config config_multi_gpu_training.yaml --gpus 2
#
# 7. TROUBLESHOOTING:
#    - Out of memory: Reduce batch_size or hidden_size
#    - NCCL errors: Try strategy='ddp_spawn' or check network configuration
#    - Hanging: Ensure all GPUs are visible and working (nvidia-smi)
#    - Different results: Set seed and use deterministic mode