diff --git a/README.md b/README.md index 8f2f61a..624d5f0 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,7 @@ English | [简体中文](./README_zh-CN.md) Pai-Megatron-Patch (https://github.com/alibaba/Pai-Megatron-Patch) is a deep learning training toolkit built for developers to easily train and predict with LLMs & VLMs by using the Megatron framework. With the continuous development of LLMs, the model structure and scale are rapidly evolving. Although these models can be conveniently built with the Transformers or DeepSpeed training frameworks, the training efficiency is comparatively low. This problem becomes even more severe when the model scale exceeds 10 billion parameters. The primary objective of Pai-Megatron-Patch is to effectively utilize the computational power of GPUs for LLMs. This tool allows convenient training of commonly used LLMs with all the acceleration techniques provided by Megatron-LM. What's New: +- **Upgrade Qwen2-VL models to support MG2HF ckpts conversion and training with multi-turn complex multimodal samples.** [🔥🔥 2024.12.27] - **Support training Qwen2-VL models by using Megatron-Core.** [🔥🔥 2024.11.27] - **Support training LLaVA models by using Megatron-Core.** [🔥🔥 2024.11.20] - **Add llm auto configurator and apply per seq sft loss for qwen2/2.5 models.** [🔥🔥 2024.10.30] diff --git a/README_zh-CN.md b/README_zh-CN.md index b897010..08d7b2c 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -41,6 +41,7 @@ Pai-Megatron-Patch是各类开源大模型和Megatron训练加速引擎之间的 - [阿里云PAI获得FewCLUE基于大模型的小样本学习双料冠军](https://developer.aliyun.com/article/788081?spm=a2c6h.12873639.article-detail.17.11c5383cHpFZks&tlog=yuekan_8) 新功能: +- **拓展Qwen2-VL模型权重转换及多轮复杂多模态数据的训练支持** [🔥🔥 2024.12.27] - **支持用Megatron-Core框架训练Qwen2-VL模型** [🔥🔥 2024.11.27] - **支持用Megatron-Core框架训练LLaVA模型** [🔥🔥 2024.11.20] - **添加大模型训练最优吞吐参数自动配置以及针对qwen2/2.5系列模型优化微调per seq sft loss.** [🔥🔥 2024.10.30] diff --git a/examples/deepseek_v2/pretrain_deepseek.py b/examples/deepseek_v2/pretrain_deepseek.py index 2f1d4ad..3a04b5c 100644 --- a/examples/deepseek_v2/pretrain_deepseek.py +++ b/examples/deepseek_v2/pretrain_deepseek.py @@ -86,7 +86,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None, None + return None, None, None, None, None, None, None args = get_args() @@ -130,7 +130,10 @@ def get_batch(data_iterator): # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - return tuple([*batch.values(), packed_seq_params]) + if args.train_mode == "pretrain": + return tuple([*batch.values(), None, packed_seq_params]) + else: + return tuple([*batch.values(), packed_seq_params]) else: raise ValueError("please set correct --dataset ") @@ -181,7 +184,7 @@ def forward_step(data_iterator, model: GPTModel): # Get the batch.
timers("batch-generator", log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(data_iterator) + tokens, labels, loss_mask, attention_mask, position_ids, _, packed_seq_params = get_batch(data_iterator) timers("batch-generator").stop() output_tensor = model(tokens, position_ids, attention_mask, labels=labels, packed_seq_params=packed_seq_params) diff --git a/examples/llama2/pretrain_mcore_llama.py b/examples/llama2/pretrain_mcore_llama.py index 035d1a8..8596f2c 100644 --- a/examples/llama2/pretrain_mcore_llama.py +++ b/examples/llama2/pretrain_mcore_llama.py @@ -86,7 +86,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None + return None, None, None, None, None, None args = get_args() @@ -101,7 +101,14 @@ def get_batch(data_iterator): batch = get_batch_on_this_tp_rank(data_iterator) # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - + return ( + batch['tokens'], + batch['labels'], + batch['loss_mask'], + batch['attention_mask'], + batch['position_ids'], + None + ) else: raise ValueError("please set correct --dataset ") @@ -146,7 +153,7 @@ def forward_step(data_iterator, model): # Get the batch. timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + tokens, labels, loss_mask, attention_mask, position_ids, _ = get_batch( data_iterator) timers('batch-generator').stop() diff --git a/examples/llama3/pretrain_llama.py b/examples/llama3/pretrain_llama.py index dc101d5..1e1f5f7 100644 --- a/examples/llama3/pretrain_llama.py +++ b/examples/llama3/pretrain_llama.py @@ -90,7 +90,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None + return None, None, None, None, None, None args = get_args() @@ -105,7 +105,14 @@ def get_batch(data_iterator): batch = get_batch_on_this_tp_rank(data_iterator) # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - + return ( + batch['tokens'], + batch['labels'], + batch['loss_mask'], + batch['attention_mask'], + batch['position_ids'], + None + ) else: raise ValueError("please set correct --dataset ") @@ -154,7 +161,7 @@ def forward_step(data_iterator, model: GPTModel): # Get the batch. 
timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + tokens, labels, loss_mask, attention_mask, position_ids, _ = get_batch( data_iterator) timers('batch-generator').stop() diff --git a/examples/llama3/pretrain_llama_mcore070.py b/examples/llama3/pretrain_llama_mcore070.py index 3836fe9..9a75f52 100644 --- a/examples/llama3/pretrain_llama_mcore070.py +++ b/examples/llama3/pretrain_llama_mcore070.py @@ -72,7 +72,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None + return None, None, None, None, None, None args = get_args() @@ -87,7 +87,14 @@ def get_batch(data_iterator): batch = get_batch_on_this_tp_rank(data_iterator) # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - + return ( + batch['tokens'], + batch['labels'], + batch['loss_mask'], + batch['attention_mask'], + batch['position_ids'], + None + ) else: raise ValueError("please set correct --dataset ") @@ -137,7 +144,7 @@ def forward_step(data_iterator, model: GPTModel): # Get the batch. timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + tokens, labels, loss_mask, attention_mask, position_ids, _ = get_batch( data_iterator) timers('batch-generator').stop() output_tensor = model(tokens, position_ids, attention_mask, diff --git a/examples/llama3_1/pretrain_llama.py b/examples/llama3_1/pretrain_llama.py index c5abf4e..bf4c8ae 100644 --- a/examples/llama3_1/pretrain_llama.py +++ b/examples/llama3_1/pretrain_llama.py @@ -90,7 +90,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None, None + return None, None, None, None, None, None, None args = get_args() @@ -133,7 +133,11 @@ def get_batch(data_iterator): # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - return tuple([*batch.values(), packed_seq_params]) + + if args.train_mode == "pretrain": + return tuple([*batch.values(), None, packed_seq_params]) + else: + return tuple([*batch.values(), packed_seq_params]) else: raise ValueError("please set correct --dataset ") @@ -184,7 +188,7 @@ def forward_step(data_iterator, model: GPTModel): # Get the batch. 
timers("batch-generator", log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids, packed_seq_params = get_batch(data_iterator) + tokens, labels, loss_mask, attention_mask, position_ids, _, packed_seq_params = get_batch(data_iterator) timers("batch-generator").stop() output_tensor = model(tokens, position_ids, attention_mask, labels=labels, packed_seq_params=packed_seq_params) diff --git a/examples/mistral/pretrain_mcore_mistral.py b/examples/mistral/pretrain_mcore_mistral.py index ff0b129..6cf0f2a 100644 --- a/examples/mistral/pretrain_mcore_mistral.py +++ b/examples/mistral/pretrain_mcore_mistral.py @@ -40,12 +40,12 @@ from megatron_patch.data import build_pretrain_dataset_from_original from megatron_patch.data.utils import get_batch_on_this_tp_rank_original, get_batch_on_this_tp_rank_idxmap_sft -from megatron_patch.model.mixtral.layer_specs import ( +from megatron_patch.model.mixtral_bak.layer_specs import ( get_gpt_layer_local_spec, get_gpt_layer_with_transformer_engine_spec, ) -from megatron_patch.model.mixtral.model import GPTModel -from megatron_patch.model.mixtral.transformer_config import TransformerConfig +from megatron_patch.model.mixtral_bak.model import GPTModel +from megatron_patch.model.mixtral_bak.transformer_config import TransformerConfig from megatron_patch.tokenizer import build_tokenizer, get_tokenizer from megatron.core.packed_seq_params import PackedSeqParams diff --git a/examples/mistral/pretrain_mcore_mistral_bak.py b/examples/mistral/pretrain_mcore_mistral_bak.py index ba2efec..a9c1113 100644 --- a/examples/mistral/pretrain_mcore_mistral_bak.py +++ b/examples/mistral/pretrain_mcore_mistral_bak.py @@ -69,7 +69,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None + return None, None, None, None, None, None args = get_args() @@ -84,7 +84,14 @@ def get_batch(data_iterator): batch = get_batch_on_this_tp_rank(data_iterator) # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) - + return ( + batch['tokens'], + batch['labels'], + batch['loss_mask'], + batch['attention_mask'], + batch['position_ids'], + None + ) else: raise ValueError("please set correct --dataset ") @@ -128,7 +135,7 @@ def forward_step(data_iterator, model): # Get the batch. 
timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + tokens, labels, loss_mask, attention_mask, position_ids, _ = get_batch( data_iterator) timers('batch-generator').stop() diff --git a/examples/qwen1_5/pretrain_mcore_qwen.py b/examples/qwen1_5/pretrain_mcore_qwen.py index 6661a0e..ebdc017 100644 --- a/examples/qwen1_5/pretrain_mcore_qwen.py +++ b/examples/qwen1_5/pretrain_mcore_qwen.py @@ -91,7 +91,7 @@ def get_batch(data_iterator): # TODO: this is pretty hacky, find a better way if (not mpu.is_pipeline_first_stage()) and (not mpu.is_pipeline_last_stage()): - return None, None, None, None, None + return None, None, None, None, None, None args = get_args() @@ -107,6 +107,14 @@ def get_batch(data_iterator): # slice batch along sequence dimension for context parallelism batch = get_batch_on_this_cp_rank(batch) + return ( + batch['tokens'], + batch['labels'], + batch['loss_mask'], + batch['attention_mask'], + batch['position_ids'], + None + ) else: raise ValueError("please set correct --dataset ") @@ -155,7 +163,7 @@ def forward_step(data_iterator, model: GPTModel): # Get the batch. timers('batch-generator', log_level=2).start() - tokens, labels, loss_mask, attention_mask, position_ids = get_batch( + tokens, labels, loss_mask, attention_mask, position_ids, _ = get_batch( data_iterator) timers('batch-generator').stop() diff --git a/examples/qwen2_vl/README.md b/examples/qwen2_vl/README.md index 88d5dae..4f0e010 100755 --- a/examples/qwen2_vl/README.md +++ b/examples/qwen2_vl/README.md @@ -57,6 +57,9 @@ wget https://atp-modelzoo-wlcb-pai.oss-cn-wulanchabu.aliyuncs.com/release/models tar -zxf wds.tgz ``` +对于视频多模态、单样本中包含多张图片、多轮对话等复杂数据集,您需要将其转换为sharegpt格式数据后再使用Megatron-Patch训练。对于sharegpt格式的数据处理,参见[链接](./dataset_preparation.md)。 + + ## Megatron-Core模型训练流程 ### Megatron-Core模型格式转换 运行`hf2mcore_qwen2_vl_convertor.sh`脚本,需要传入的参数列表如下 @@ -84,6 +87,21 @@ false \ bf16 ``` +当您需要将训练好的checkpoint转换回huggingface格式用于推理时,执行 + +```bash +cd /workspace/Pai-Megatron-Patch/toolkits/model_checkpoints_convertor/qwen +bash hf2mcore_qwen2_vl_convertor.sh \ +7B \ +/mnt/qwen2-vl-ckpts/Qwen2-VL-7B-Instruct-tp2pp2 \ +/mnt/qwen2-vl-ckpts/Qwen2-VL-7B-Instruct-tp2pp2-back \ +2 \ +2 \ +true \ +bf16 \ +/mnt/qwen2-vl-ckpts/Qwen2-VL-7B-Instruct +``` + ### Megatron-Core预训练 #### 预训练命令描述 @@ -102,7 +120,7 @@ TP=${10} # 模型并行度 PP=${11} # 流水并行度 CP=${12} # 上下文并行度 DO=${13} # 是否使用Megatron版Zero-1降显存优化器: true, false -FL=${14} # 是否优先使用Flash Attention: true, false +FL=${14} # 是否优先使用Flash Attention: false AC=${15} # 激活检查点模式: sel, full, offload, false OPTIMIZER_OFFLOAD=${16} # 是否启用Offload optimizer: false, static, auto SAVE_INTERVAL=${17} # 保存ckpt的间隔 @@ -123,17 +141,47 @@ sh run_mcore_qwen.sh \ dsw \ 7B \ 1 \ -256 \ -0.00015 \ +32 \ +1e-5 \ +1e-6 \ +2048 \ +2048 \ +bf16 \ +2 \ +2 \ +1 \ +true \ +false \ +true \ +false \ +100000 \ +/mnt/llava-datasets/LLaVA-Pretrain/wds \ +/mnt/llava-datasets/LLaVA-Pretrain/wds \ +/mnt/qwen2-vl-ckpts/Qwen2-VL-7B-Instruct-tp2pp2 \ +20000 \ +200 \ +/workspace/output_mcore_qwen2vl_pretrain +``` + +由于PP切分时,PP Rank 0额外的ViT会导致其负载略高于其他PP Rank,为了达到最佳性能,您可能需要调整`MP_PP0_LAYERS`变量降低PP Rank 0的LLM层数。 + +```bash +cd /workspace/Pai-Megatron-Patch/examples/qwen2_vl +MP_PP0_LAYERS=12 sh run_mcore_qwen.sh \ +dsw \ +7B \ +1 \ +32 \ 1e-5 \ -1024 \ -1024 \ +1e-6 \ +2048 \ +2048 \ bf16 \ 2 \ 2 \ 1 \ true \ -true \ +false \ true \ false \ 100000 \ diff --git a/examples/qwen2_vl/dataset_helpers.py b/examples/qwen2_vl/dataset_helpers.py index 7041363..474cefe 100644 --- 
a/examples/qwen2_vl/dataset_helpers.py +++ b/examples/qwen2_vl/dataset_helpers.py @@ -1,4 +1,16 @@ -# TODO: Add a License +# Copyright (c) 2024 Alibaba PAI and Nvidia Megatron-LM Team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import dataclasses import re import sys @@ -11,6 +23,7 @@ import numpy as np import torch from torchvision import transforms as T +import json from megatron.energon import ( Batch, @@ -18,6 +31,8 @@ VQASample, ) +from megatron_patch.data.energon.chatml import ChatMLSample + from megatron.training import get_args from megatron_patch.tokenizer import get_tokenizer @@ -26,14 +41,16 @@ class ImageTaskSample: __key__: str __subflavors__: Dict - # (c, h, w) - imgs: List[np.ndarray] - image_thw_grids: List[Tuple[int]] - video_thw_grids: List[Tuple[int]] + + imgs: List[np.ndarray] # (c, h, w) + videos: List[np.ndarray] # (c, h, w) + + image_thw_grids: np.ndarray + video_thw_grids: np.ndarray image_input_mask: np.ndarray video_input_mask: np.ndarray text: np.ndarray - target: torch.Tensor = None + target: np.ndarray # Typing for the resulting batch data after encode_batch() @dataclass @@ -42,6 +59,7 @@ class VQATaskBatch(Batch): __subflavors__: List[Dict] # (num_tiles, c, h, w) imgs: torch.Tensor + videos: torch.Tensor image_thw_grids: torch.Tensor video_thw_grids: torch.Tensor image_input_mask: torch.Tensor @@ -90,7 +108,7 @@ def convert_to_qwen2vl_content( return contents -class TaskEncoder(DefaultTaskEncoder[VQASample, ImageTaskSample, VQATaskBatch, dict]): +class TaskEncoder(DefaultTaskEncoder[Union[VQASample, ChatMLSample], ImageTaskSample, VQATaskBatch, dict]): """A simple task encoder for captioning.""" def __init__( @@ -110,43 +128,29 @@ def __init__( self.seq_len = self.args.max_padding_length - def encode_sample(self, sample: VQASample): + def encode_sample(self, sample: Union[VQASample, ChatMLSample]): if isinstance(sample, VQASample): is_llava_training = sample.__subflavors__['is_llava_training'] if 'is_llava_training' in sample.__subflavors__ else False if is_llava_training: raise NotImplementedError('Sample format not supported') else: yield self.encode_vqa(sample) + elif isinstance(sample, ChatMLSample): + yield self.encode_chatml(sample) else: raise NotImplementedError('Sample format not supported') - def encode_vqa(self, sample: VQASample): - augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False - has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False - - if has_video: - # Grab the selected frames of the video as a tensor with shape - # fhwc: (num_frames, height, width, num_channels). 
- # video_fhwc = sample.image.permute(0, 2, 3, 1) - # selected_frames = torch.linspace( - # 0, video_fhwc.shape[0] - 1, self.args.num_frames).long() - # video_frame_fhwc = video_fhwc[selected_frames] - # imgs = [] - # for video_frame_hwc in video_frame_fhwc: - # imgs += get_visual_transform( - # video_frame_hwc, self.img_h, self.img_w, - # self.args.use_tiling, self.args.max_num_tiles, - # self.args.use_thumbnail, augment=False) - raise NotImplementedError() - else: - # TODO: add args - imgs = get_visual_transform( - sample.image - ) - resized_height, resized_width = imgs[0].shape[-2:] - # shape: c x img_h x img_w - # split single image into tiles for dynamic resolution - patches = np.tile(np.array(imgs[0]), (self.temporal_patch_size, 1, 1, 1)) + def _flatten_visual_inputs(self, visuals, is_image: bool = True): + flattened = [] + thw_grids = [] + for visual in visuals: + if is_image: + resized_height, resized_width = visual.shape[-2:] + patches = np.tile(np.array(visual), (self.temporal_patch_size, 1, 1, 1)) + else: + assert len(visual) % self.temporal_patch_size == 0 + patches = np.array(visual) + resized_height, resized_width = patches.shape[-2:] channel = patches.shape[1] grid_t = patches.shape[0] // self.temporal_patch_size @@ -162,18 +166,171 @@ def encode_vqa(self, sample: VQASample): self.merge_size, self.patch_size, ) - patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) + patches = patches.transpose(0, 3, 6, 4, 7, 2, 1, 5, 8) flatten_patches = patches.reshape( grid_t * grid_h * grid_w, channel * self.temporal_patch_size * self.patch_size * self.patch_size - ) + ) + flattened.append(flatten_patches) + thw_grids.append((grid_t, grid_h, grid_w)) + return flattened, np.array(thw_grids) + + def encode_chatml(self, sample: ChatMLSample): + # TODO: modify get_visual_transform to add more augmentations + imgs = [get_visual_transform(img)[0] for img in sample.imgs] + videos = [[get_visual_transform(frame)[0] for frame in video] for video in sample.videos] + + # NOTE: flatten all images + flattened_imgs, image_thw_grids = self._flatten_visual_inputs(imgs, is_image=True) + flattened_videos, video_thw_grids = self._flatten_visual_inputs(videos, is_image=False) + + # NOTE: generate qwen2vl conversations + conversation = json.loads(sample.conversation) if isinstance(sample.conversation, (str, bytes)) else sample.conversation + + role_key = 'from' if 'from' in conversation[0] else 'role' + content_key = 'value' if 'from' in conversation[0] else 'content' + + # NOTE: assume the conversation format is: [System]? (User Assistant)+ + converted_conversation = [] + if len(conversation) % 2 == 0: + # Default Prompt + converted_conversation.append({ + 'role': 'system', + 'content': 'You are a helpful assistant.' 
+ }) + else: + converted_conversation.append({ + 'role': 'system', + 'content': conversation[0][content_key] + }) + conversation = conversation[1:] + + EXPECTED_ROLE = ['human', 'gpt'] + for turn_idx, turn in enumerate(conversation): + role = turn[role_key] + if role != EXPECTED_ROLE[turn_idx % len(EXPECTED_ROLE)]: + raise InternalWarning(f"Expected a conversation organized in the order: [sys] human gpt human gpt..., but got role '{role}' in turn {turn_idx}") + content = turn[content_key] + + if role == 'human': + role = 'user' + content = convert_to_qwen2vl_content(content) + elif role == 'gpt': + role = 'assistant' + + converted_conversation.append({ + 'role': role, + 'content': content + }) + conversation = converted_conversation + + # NOTE: we need to mask all system/user input tokens and assistant generation prefix tokens + input_ids = self.tokenizer.apply_chat_template(conversation, tokenize=True, return_tensors="np")[0] + target = input_ids.copy() + + system_prompt_prefix = len(self.tokenizer.apply_chat_template([conversation[0]], tokenize=True)) + assistant_generation_prefix = 3 + pad_token_id = self.tokenizer.pad_token_id + + target[:system_prompt_prefix] = pad_token_id + offset = system_prompt_prefix + for turn_idx, turn in enumerate(conversation[1:]): + turn_tokens = self.tokenizer.apply_chat_template([turn], tokenize=True, return_tensors="np")[0] + turn_content = turn_tokens[system_prompt_prefix:] + n_tokens = len(turn_content) + if (target[offset: offset + n_tokens] != turn_content).any(): + raise InternalWarning("Encode Error") + + if turn['role'] == 'user': + target[offset: offset + n_tokens] = pad_token_id + elif turn['role'] == 'assistant': + target[offset: offset + assistant_generation_prefix] = pad_token_id + offset += n_tokens + + # NOTE: expand image_pad & video_pad + merge_length = self.merge_size**2 + image_token_id, video_token_id = self.tokenizer.encode(['<|image_pad|>', '<|video_pad|>']) + + image_token_indices = np.where(input_ids == image_token_id)[0] + assert len(image_token_indices) == len(image_thw_grids), f"With {len(image_thw_grids)} images in the sample, but {len(image_token_indices)} image placeholders!" + video_token_indices = np.where(input_ids == video_token_id)[0] + assert len(video_token_indices) == len(video_thw_grids), f"With {len(video_thw_grids)} videos in the sample, but {len(video_token_indices)} video placeholders!"
+ image_thw_grids, video_thw_grids = np.array(image_thw_grids, dtype=np.int64), np.array(video_thw_grids, dtype=np.int64) + + target_length = ( + input_ids.shape[0] + - image_thw_grids.shape[0] + image_thw_grids.prod(axis=-1).sum() // merge_length + - video_thw_grids.shape[0] + video_thw_grids.prod(axis=-1).sum() // merge_length + ) + if target_length > self.seq_len: + raise InternalWarning(f"Long sequence with length {target_length} found, dropped...") + final_input_ids = np.zeros(target_length, dtype=input_ids.dtype) + final_input_masks = final_input_ids.copy() + + image_idx, video_idx = 0, 0 + indices = np.sort(np.concatenate([image_token_indices, video_token_indices])) + + cur_x, cur_y = 0, 0 + for idx in indices: + token_id = input_ids[idx] + if token_id == image_token_id: + size = image_thw_grids[image_idx].prod() // merge_length + image_idx += 1 + elif token_id == video_token_id: + size = video_thw_grids[video_idx].prod() // merge_length + video_idx += 1 + # NOTE: + # input_ids[cur_x:idx] -> final_input_ids[cur_y:cur_y + idx - cur_x] + # input_ids[idx] -> final_input_ids[cur_y + idx - cur_x: cur_y + idx - cur_x + size] + final_input_ids[cur_y: cur_y + idx - cur_x] = input_ids[cur_x:idx] + final_input_masks[cur_y: cur_y + idx - cur_x] = target[cur_x:idx] + cur_y += idx - cur_x + final_input_ids[cur_y: cur_y + size] = token_id + final_input_masks[cur_y: cur_y + size] = pad_token_id + cur_y += size + cur_x = idx + 1 + + if cur_x < len(input_ids): + final_input_ids[cur_y:] = input_ids[cur_x:] + final_input_masks[cur_y:] = target[cur_x:] - # flatten_patches, (grid_t, grid_h, grid_w) - thw_grids = [(grid_t, grid_h, grid_w)] + target = np.roll(final_input_masks, shift=-1) + target[-1] = pad_token_id - assert "" in sample.context # ? + if (target == pad_token_id).all(): + raise InternalWarning("Sample with all masked label, dropped.") - # NOTE: we expect a context is a string with conetnt + image_input_mask = final_input_ids == self.tokenizer.image_token_id + video_input_mask = final_input_ids == self.tokenizer.video_token_id + # collect data + return ImageTaskSample( + __key__=sample.__key__, + __subflavors__=sample.__subflavors__, + imgs=flattened_imgs, + videos=flattened_videos, + + image_thw_grids=image_thw_grids, + video_thw_grids=video_thw_grids, + + image_input_mask=image_input_mask, + video_input_mask=video_input_mask, + + text=final_input_ids, + target=target, + ) + + def encode_vqa(self, sample: VQASample): + augment = sample.__subflavors__['augmentation'] if 'augmentation' in sample.__subflavors__ else False + has_video = sample.__subflavors__['has_video'] if 'has_video' in sample.__subflavors__ else False + if has_video: + raise NotImplementedError("You should use sharegpt dataset to train with videos.") + else: + # TODO: add args + imgs = get_visual_transform(sample.image) + flatten_patches, thw_grids = self._flatten_visual_inputs(imgs, is_image=True) + + assert "" in sample.context # ? 
+ # NOTE: we expect the context to be a string with content if isinstance(sample.answers, list): answer_list = sample.answers weight_list = np.array(sample.answer_weights).astype(np.float32) @@ -231,30 +388,49 @@ def encode_vqa(self, sample: VQASample): return ImageTaskSample( __key__=sample.__key__, __subflavors__=sample.__subflavors__, + imgs=flatten_patches, + videos=list(), + image_thw_grids=thw_grids, - video_thw_grids=None, + video_thw_grids=torch.empty([0, 3], dtype=torch.long), + image_input_mask=image_input_mask, video_input_mask=None, + text=input_ids, target=target, ) def batch(self, samples: List[ImageTaskSample]) -> VQATaskBatch: # Stack images to [num_tiles, c, h, w]. If there are no images (text-only), then use a dummy image. - imgs = [s.imgs for s in samples] + imgs = [img for s in samples for img in s.imgs] if len(imgs) > 0: imgs = torch.cat([torch.from_numpy(img) for img in imgs]) else: imgs = torch.empty([0, 3 * self.temporal_patch_size * self.patch_size * self.patch_size], dtype=torch.float32) - thw_grids = [thw_grids for s in samples for thw_grids in s.image_thw_grids] - if len(thw_grids) > 0: - thw_grids = torch.from_numpy(np.array(thw_grids)).long() - assert thw_grids.prod(dim=-1).sum() == imgs.shape[0] + image_thw_grids = [thw_grids for s in samples for thw_grids in s.image_thw_grids] + if len(image_thw_grids) > 0: + image_thw_grids = torch.from_numpy(np.array(image_thw_grids)).long() + assert image_thw_grids.prod(dim=-1).sum() == imgs.shape[0] + else: + image_thw_grids = torch.empty([0, 3], dtype=torch.long) + + # Stack videos to [num_tiles, c, h, w]. If there are no videos (text-only), then use a dummy video. + videos = [video for s in samples for video in s.videos] + if len(videos) > 0: + videos = torch.cat([torch.from_numpy(video) for video in videos]) else: - thw_grids = torch.empty([0, 3], dtype=torch.long) + videos = torch.empty([0, 3 * self.temporal_patch_size * self.patch_size * self.patch_size], dtype=torch.float32) + video_thw_grids = [thw_grids for s in samples for thw_grids in s.video_thw_grids] + if len(video_thw_grids) > 0: + video_thw_grids = torch.from_numpy(np.array(video_thw_grids)).long() + assert video_thw_grids.prod(dim=-1).sum() == videos.shape[0] + else: + video_thw_grids = torch.empty([0, 3], dtype=torch.long) + # If the user hasn't defined a target sequence length, then use the max along the sample lengths. max_seq_len = self.seq_len if not max_seq_len: @@ -283,8 +459,9 @@ def batch(self, samples: List[ImageTaskSample]) -> VQATaskBatch: __keys__=[s.__key__ for s in samples], __subflavors__=[s.__subflavors__ for s in samples], imgs=imgs, - image_thw_grids=thw_grids, - video_thw_grids=None, + videos=videos, + image_thw_grids=image_thw_grids, + video_thw_grids=video_thw_grids, image_input_mask=torch.from_numpy(image_input_masks), video_input_mask=torch.from_numpy(video_input_masks), text=torch.from_numpy(text_mat), diff --git a/examples/qwen2_vl/dataset_preparation.md b/examples/qwen2_vl/dataset_preparation.md new file mode 100644 index 0000000..ad5dd58 --- /dev/null +++ b/examples/qwen2_vl/dataset_preparation.md @@ -0,0 +1,72 @@ +# 准备Qwen2-VL多模态数据集 + +当前Qwen2-VL支持特定格式的复杂多模态样本的训练,您可按照下述流程将您的数据集转换为Qwen2-VL的支持格式。 + +## 原始数据 + +在转换前,您可能需要自行将数据集转换为**sharegpt格式**,示例如下: +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "human instruction" + }, + { + "from": "gpt", + "value": "model response" + }, + { + "from": "human", + "value": "