
[megatron] Update long text shell #4106


Merged
merged 2 commits on May 7, 2025
2 changes: 1 addition & 1 deletion docs/source/Instruction/Megatron-SWIFT训练.md
@@ -102,8 +102,8 @@ swift infer \
I am a language model developed by swift, you can call me swift-robot. How can I assist you?
```

- More examples, such as packing and multi-machine training, can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).
- For pre-training, you can use `megatron pt` instead of `megatron sft`, which will use a generative template for training.
- **More examples**: including packing, multi-node training, 32K context, MoE models, and pre-training; see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).

## Benchmark

2 changes: 1 addition & 1 deletion docs/source_en/Instruction/Megatron-SWIFT-Training.md
@@ -108,8 +108,8 @@ The inference results are as follows:
I am a language model developed by swift, you can call me swift-robot. How can I assist you?
```

- More examples, such as packing and multi-machine training, can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).
- For pre-training, you can use `megatron pt` instead of `megatron sft`, which will use a generative template for training.
- **More examples**: including packing, multi-node training, 32K context, MoE models, and pre-training; see [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron).

## Benchmark
The speed comparison of full-parameter training for Dense/MoE models using `megatron sft` and `swift sft` on a single machine with eight A800 GPUs is shown below. The corresponding scripts can be found [here](https://github.com/modelscope/ms-swift/tree/main/examples/train/megatron/benchmark).
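The documentation note above says `megatron pt` can replace `megatron sft` for pre-training with a generative template. Below is a minimal sketch of such an invocation, reusing the parallelism and optimizer flags from the long_text.sh script added later in this PR; the dataset placeholder is illustrative and not part of this PR.

```shell
# Sketch only: pre-training variant of the long_text.sh example below.
# '<your-pretraining-dataset>' is a placeholder, not a real dataset id.
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron pt \
--load Qwen2.5-7B-mcore \
--dataset '<your-pretraining-dataset>' \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
--global_batch_size 8 \
--train_iters 1000 \
--lr 1e-5 \
--min_lr 1e-6 \
--max_length 8192 \
--save megatron_output/Qwen2.5-7B \
--use_flash_attn true
```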
33 changes: 33 additions & 0 deletions examples/train/megatron/long_text.sh
@@ -0,0 +1,33 @@
# Env: 4 * A100
# https://github.com/modelscope/ms-swift/blob/main/examples/train/long_text/zero3.sh
# Max Length: 32K
# GPU Memory: 4 * 50GB, Training Speed 23s/it
NPROC_PER_NODE=4 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
megatron sft \
--load Qwen2.5-7B-mcore \
--dataset 'ZhipuAI/LongWriter-6k' \
--tensor_model_parallel_size 4 \
--micro_batch_size 1 \
--global_batch_size 8 \
--packing true \
--recompute_granularity full \
--recompute_method uniform \
--recompute_num_layers 1 \
--train_iters 1000 \
--eval_iters 50 \
--finetune true \
--cross_entropy_loss_fusion true \
--lr 1e-5 \
--lr_warmup_iters 100 \
--min_lr 1e-6 \
--save megatron_output/Qwen2.5-7B \
--eval_interval 200 \
--save_interval 200 \
--max_length 32768 \
--num_workers 8 \
--dataset_num_proc 8 \
--no_save_optim true \
--no_save_rng true \
--sequence_parallel true \
--use_flash_attn true
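
The script loads a Megatron-format checkpoint named `Qwen2.5-7B-mcore`. A sketch of how such a checkpoint could be produced, assuming ms-swift's HF-to-mcore conversion via `swift export --to_mcore` (flag names may vary across versions, so verify against your installation):

```shell
# Sketch: convert Hugging Face weights to the mcore format that `--load` expects.
# Assumes `swift export` supports `--to_mcore`; check your ms-swift version.
CUDA_VISIBLE_DEVICES=0 \
swift export \
--model Qwen/Qwen2.5-7B \
--to_mcore true \
--torch_dtype bfloat16 \
--output_dir Qwen2.5-7B-mcore
```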
7 changes: 4 additions & 3 deletions examples/train/packing/streaming.sh
@@ -1,5 +1,6 @@
# 4 * 36GB
# A demo using the Hugging Face dataset
# The first model weights will be saved around step 70.
NPROC_PER_NODE=4 \
MAX_PIXELS=1003520 \
CUDA_VISIBLE_DEVICES=0,1,2,3 \
@@ -9,20 +10,20 @@ swift sft \
--train_type lora \
--dataset 'HF::linxy/LaTeX_OCR:full#20000' \
--torch_dtype bfloat16 \
--max_steps 1000 \
--attn_impl flash_attn \
--streaming true \
--shuffle_buffer_size 1000 \
--packing true \
--save_strategy epoch \
--max_steps 1000 \
--max_epochs 5 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--learning_rate 1e-4 \
--lora_rank 8 \
--lora_alpha 32 \
--target_modules all-linear \
--gradient_accumulation_steps 1 \
--eval_steps 100 \
--save_steps 100 \
--save_total_limit 2 \
--logging_steps 5 \
--max_length 8192 \
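
Per the new comment, the first LoRA checkpoint lands around step 70, presumably at the end of the first streamed epoch with `--save_strategy epoch`. A hypothetical quick check of that checkpoint, assuming ms-swift's standard `swift infer` CLI; the adapter path is a placeholder and depends on your output settings, and nothing here is produced by this PR:

```shell
# Hypothetical follow-up: chat with the first saved LoRA checkpoint.
# The --adapters path is a placeholder; adjust it to your actual output dir.
CUDA_VISIBLE_DEVICES=0 \
MAX_PIXELS=1003520 \
swift infer \
--adapters output/checkpoint-70 \
--stream true \
--max_new_tokens 512
```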