1+ {
2+ "deepspeed_multinode_launcher" : "standard" ,
3+ "offload_optimizer_device" : "cpu" ,
4+ "offload_param_device" : "cpu" ,
5+ "zero3_init_flag" : true ,
6+ "zero3_save_16bit_model" : true ,
7+ "zero_stage" : 3 ,
8+ "overlap_comm" : true ,
9+ "sub_group_size" : 1000000 ,
10+ "stage3_max_live_parameters" : 1000 ,
11+ "stage3_max_reuse_distance" : 1000000 ,
12+ "stage3_gather_16bit_weights_on_model_save" : true ,
13+ "train_batch_size" : "auto" ,
14+ "train_micro_batch_size_per_gpu" : "auto" ,
15+ "gradient_accumulation_steps" : "auto" ,
16+ "reduce_bucket_size" : 5000000 ,
17+ "stage3_prefetch_bucket_size" : 5000000 ,
18+ "stage3_param_persistence_threshold" : 5000000 ,
19+ "memory_efficient_linear" : true ,
20+ "contiguous_gradients" : true ,
21+ "zero_optimization" : {
22+ "stage" : 3 ,
23+ "cpu_offload" : true ,
24+ "contiguous_gradients" : true ,
25+ "sub_group_size" : 1000000 ,
26+ "stage3_prefetch_bucket_size" : 5000000 ,
27+ "stage3_param_persistence_threshold" : 5000000 ,
28+ "stage3_max_live_parameters" : 1000 ,
29+ "stage3_max_reuse_distance" : 1000000 ,
30+ "stage3_gather_16bit_weights_on_model_save" : true
31+ },
32+ "optimizer" : {
33+ "type" : "Adam" ,
34+ "params" : {
35+ "lr" : "auto" ,
36+ "betas" : [
37+ 0.9 ,
38+ 0.999
39+ ],
40+ "eps" : 1e-8 ,
41+ "weight_decay" : 0.0
42+ }
43+ }
44+ }
0 commit comments