From fac1f0caa1db36d3648526a9c5380cdb9a0b8211 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Xingchen=20Song=28=E5=AE=8B=E6=98=9F=E8=BE=B0=29?= Date: Wed, 13 Dec 2023 12:58:28 +0800 Subject: [PATCH] refactor(yaml): Config ctc/cmvn/tokenizer in train.yaml (#2205) * refactor(yaml): Config ctc/cmvn/tokenizer in train.yaml * refactor(yaml): pass training * refactor(yaml): try to pass unittest * refactor(yaml): remove lfmmi * refactor(yaml): nst recipe * [refactor] refine run.sh * [refactor] refine run.sh * [refactor] rebase main * [refactor] try to pass ut * [refactor] refine librispeech in next PR * [refactor] add todo * [refactor] refine paraformer in next PR * [refactor] make sos = 2 * [refactor] make sos = 2 * [refactor] try to pass ut * [refactor] refine onnx_gpu * [refactor] try to pass ut * [refactor] try to pass ut * [refactor] try to pass ut * [refactor] try to pass ut * refactor: pass decoding * refactor: pass decoding * refactor: pass decoding * refactor: refine tokenizer * refactor: try to pass ut --- .../aishell/NST/conf/train_conformer.yaml | 25 ++++++++ examples/aishell/NST/run_nst.sh | 9 --- .../aishell/rnnt/conf/conformer_rnnt.yaml | 24 ++++++++ .../rnnt/conf/conformer_u2pp_rnnt.yaml | 24 ++++++++ .../conf/example_embedding_predictor.yaml | 24 ++++++++ examples/aishell/rnnt/run.sh | 12 +--- examples/aishell/s0/conf/train_conformer.yaml | 25 ++++++++ .../s0/conf/train_conformer_no_pos.yaml | 25 ++++++++ .../aishell/s0/conf/train_ebranchformer.yaml | 25 ++++++++ .../aishell/s0/conf/train_transformer.yaml | 25 ++++++++ .../s0/conf/train_u2++_branchformer.yaml | 25 ++++++++ .../aishell/s0/conf/train_u2++_conformer.yaml | 25 ++++++++ .../s0/conf/train_u2++_conformer_1.8B.yaml | 25 ++++++++ .../s0/conf/train_u2++_efficonformer_v1.yaml | 25 ++++++++ .../train_u2++_efficonformer_v1_stream.yaml | 25 ++++++++ .../s0/conf/train_u2++_efficonformer_v2.yaml | 25 ++++++++ .../s0/conf/train_u2++_lite_conformer.yaml | 25 ++++++++ .../s0/conf/train_u2++_transformer.yaml | 25 ++++++++ .../s0/conf/train_unified_conformer.yaml | 25 ++++++++ .../s0/conf/train_unified_conformer_ctl.yaml | 24 ++++++++ .../s0/conf/train_unified_transformer.yaml | 25 ++++++++ examples/aishell/s0/run.sh | 12 +--- test/resources/global_cmvn | 1 + test/wenet/dataset/test_processor.py | 13 +++- test/wenet/text/test_paraformer_tokenizer.py | 2 +- test/wenet/utils/test_init_model.py | 25 ++++++++ test/wenet/utils/test_init_tokenizer.py | 24 +++++--- tools/onnx2horizonbin.py | 29 +++------ wenet/bin/alignment.py | 3 +- wenet/bin/export_onnx_gpu.py | 8 ++- wenet/bin/recognize.py | 18 +----- wenet/bin/recognize_onnx_gpu.py | 2 +- wenet/bin/train.py | 3 +- ...ert_paraformer_to_wenet_config_and_ckpt.py | 7 ++- wenet/utils/init_model.py | 6 +- wenet/utils/init_tokenizer.py | 61 ++++++++++++------- wenet/utils/train_utils.py | 18 ------ ...onvert_whisper_to_wenet_config_and_ckpt.py | 17 ++++-- 38 files changed, 613 insertions(+), 128 deletions(-) create mode 100644 test/resources/global_cmvn diff --git a/examples/aishell/NST/conf/train_conformer.yaml b/examples/aishell/NST/conf/train_conformer.yaml index 8499de2e9..221c3635b 100644 --- a/examples/aishell/NST/conf/train_conformer.yaml +++ b/examples/aishell/NST/conf/train_conformer.yaml @@ -28,12 +28,37 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + 
special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 1200 diff --git a/examples/aishell/NST/run_nst.sh b/examples/aishell/NST/run_nst.sh index 6a4eaf0e5..11af48624 100644 --- a/examples/aishell/NST/run_nst.sh +++ b/examples/aishell/NST/run_nst.sh @@ -72,7 +72,6 @@ data_type=shard num_utts_per_shard=1000 train_set=train train_config=conf/train_conformer.yaml -cmvn=true average_checkpoint=true target_pt=80 decode_checkpoint=$dir/$target_pt.pt @@ -113,9 +112,6 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then dist_backend="nccl" # the global_cmvn file need to be calculated by combining both supervised/unsupervised datasets, # and it should be positioned at data/${train_set}/global_cmvn . - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir/global_cmvn - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference @@ -133,14 +129,12 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then --train_engine ${train_engine} \ --config $train_config \ --data_type $data_type \ - --symbol_table $dict \ --train_data data/$train_set/$data_list \ --cv_data data/dev/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ - $cmvn_opts \ --pin_memory \ --deepspeed_config ${deepspeed_config} \ --deepspeed.save_states ${deepspeed_save_states} @@ -190,7 +184,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --beam_size 10 \ --batch_size 1 \ --penalty 0.0 \ - --dict $dict \ --ctc_weight $ctc_weight \ --reverse_weight $reverse_weight \ --result_file $test_dir/text \ @@ -216,7 +209,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then --beam_size 10 \ --batch_size 1 \ --penalty 0.0 \ - --dict $dict \ --ctc_weight $ctc_weight \ --reverse_weight $reverse_weight \ --result_file $dev_dir/text \ @@ -275,7 +267,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --beam_size 10 \ --batch_size 1 \ --penalty 0.0 \ - --dict $dict \ --ctc_weight $ctc_weight \ --reverse_weight $reverse_weight \ --result_file data/train/${dir_split}data_sublist${job_num}/${hypo_name} \ diff --git a/examples/aishell/rnnt/conf/conformer_rnnt.yaml b/examples/aishell/rnnt/conf/conformer_rnnt.yaml index 690743760..a70284870 100644 --- a/examples/aishell/rnnt/conf/conformer_rnnt.yaml +++ b/examples/aishell/rnnt/conf/conformer_rnnt.yaml @@ -49,6 +49,29 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid transducer+ctc+attention model: transducer model_conf: @@ -59,6 +82,7 @@ model_conf: length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml 
b/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml index 5079ef988..b7ffb08d9 100644 --- a/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml +++ b/examples/aishell/rnnt/conf/conformer_u2pp_rnnt.yaml @@ -53,6 +53,29 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid transducer+ctc+attention model: transducer model_conf: @@ -63,6 +86,7 @@ model_conf: length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/rnnt/conf/example_embedding_predictor.yaml b/examples/aishell/rnnt/conf/example_embedding_predictor.yaml index 9a1b4ecac..b6abc3be4 100644 --- a/examples/aishell/rnnt/conf/example_embedding_predictor.yaml +++ b/examples/aishell/rnnt/conf/example_embedding_predictor.yaml @@ -45,6 +45,29 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid transducer+ctc+attention model: transducer model_conf: @@ -55,6 +78,7 @@ model_conf: length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/rnnt/run.sh b/examples/aishell/rnnt/run.sh index 9b9e7ff6a..c04718adc 100644 --- a/examples/aishell/rnnt/run.sh +++ b/examples/aishell/rnnt/run.sh @@ -42,7 +42,6 @@ num_utts_per_shard=1000 train_set=train train_config=conf/conformer_u2pp_rnnt.yaml -cmvn=true dir=exp/conformer_rnnt checkpoint= @@ -92,11 +91,10 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then mkdir -p $(dirname $dict) echo " 0" > ${dict} # 0 is for "blank" in CTC echo " 1" >> ${dict} # must be 1 + echo " 2" >> $dict tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \ | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \ - awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict + awk '{print $0 " " NR+2}' >> ${dict} fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -118,9 +116,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # Use "nccl" if it works, otherwise use "gloo" dist_backend="nccl" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference @@ -137,14 +132,12 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --train_engine ${train_engine} \ --config $train_config \ --data_type $data_type \ - --symbol_table $dict \ --train_data data/$train_set/data.list \ --cv_data data/dev/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ --model_dir $dir \ --ddp.dist_backend $dist_backend \ --num_workers 1 \ - $cmvn_opts \ --pin_memory 
\ --deepspeed_config ${deepspeed_config} \ --deepspeed.save_states ${deepspeed_save_states} @@ -183,7 +176,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --beam_size 10 \ --batch_size 32 \ --penalty 0.0 \ - --dict $dict \ --ctc_weight $rescore_ctc_weight \ --transducer_weight $rescore_transducer_weight \ --attn_weight $rescore_attn_weight \ diff --git a/examples/aishell/s0/conf/train_conformer.yaml b/examples/aishell/s0/conf/train_conformer.yaml index b8ce511cd..ef0694cb0 100644 --- a/examples/aishell/s0/conf/train_conformer.yaml +++ b/examples/aishell/s0/conf/train_conformer.yaml @@ -28,12 +28,37 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_conformer_no_pos.yaml b/examples/aishell/s0/conf/train_conformer_no_pos.yaml index a2d5d03f5..c90a83904 100644 --- a/examples/aishell/s0/conf/train_conformer_no_pos.yaml +++ b/examples/aishell/s0/conf/train_conformer_no_pos.yaml @@ -28,12 +28,37 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_ebranchformer.yaml b/examples/aishell/s0/conf/train_ebranchformer.yaml index edc952295..5136f1ad9 100644 --- a/examples/aishell/s0/conf/train_ebranchformer.yaml +++ b/examples/aishell/s0/conf/train_ebranchformer.yaml @@ -31,12 +31,37 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_transformer.yaml b/examples/aishell/s0/conf/train_transformer.yaml index b7d7eee83..ef88c7420 100644 --- a/examples/aishell/s0/conf/train_transformer.yaml +++ b/examples/aishell/s0/conf/train_transformer.yaml @@ -23,12 +23,37 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 
'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_u2++_branchformer.yaml b/examples/aishell/s0/conf/train_u2++_branchformer.yaml index ef12c13a4..8702fbeb4 100644 --- a/examples/aishell/s0/conf/train_u2++_branchformer.yaml +++ b/examples/aishell/s0/conf/train_u2++_branchformer.yaml @@ -37,13 +37,38 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_u2++_conformer.yaml b/examples/aishell/s0/conf/train_u2++_conformer.yaml index b4587bce3..fadc1c451 100644 --- a/examples/aishell/s0/conf/train_u2++_conformer.yaml +++ b/examples/aishell/s0/conf/train_u2++_conformer.yaml @@ -33,13 +33,38 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml b/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml index c13b4b295..d4de4c440 100644 --- a/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml +++ b/examples/aishell/s0/conf/train_u2++_conformer_1.8B.yaml @@ -33,13 +33,38 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml b/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml index 
3d0de82db..928227565 100644 --- a/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml +++ b/examples/aishell/s0/conf/train_u2++_efficonformer_v1.yaml @@ -38,7 +38,31 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option @@ -46,6 +70,7 @@ model_conf: reverse_weight: 0.3 # dataset related +dataset: asr dataset_conf: batch_conf: batch_size: 16 diff --git a/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml b/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml index 3b5a99a86..65af5845d 100644 --- a/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml +++ b/examples/aishell/s0/conf/train_u2++_efficonformer_v1_stream.yaml @@ -38,7 +38,31 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option @@ -46,6 +70,7 @@ model_conf: reverse_weight: 0.3 # dataset related +dataset: asr dataset_conf: batch_conf: batch_size: 16 diff --git a/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml b/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml index c23e1b64d..e14f3ba07 100644 --- a/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml +++ b/examples/aishell/s0/conf/train_u2++_efficonformer_v2.yaml @@ -38,7 +38,31 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option @@ -46,6 +70,7 @@ model_conf: reverse_weight: 0.3 # dataset related +dataset: asr dataset_conf: batch_conf: batch_size: 16 diff --git a/examples/aishell/s0/conf/train_u2++_lite_conformer.yaml b/examples/aishell/s0/conf/train_u2++_lite_conformer.yaml index 1eb280de2..0ffc064f2 100644 --- a/examples/aishell/s0/conf/train_u2++_lite_conformer.yaml +++ b/examples/aishell/s0/conf/train_u2++_lite_conformer.yaml @@ -33,7 +33,31 @@ decoder_conf: self_attention_dropout_rate: 0.1 src_attention_dropout_rate: 0.1 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: 
global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option @@ -41,6 +65,7 @@ model_conf: reverse_weight: 0.3 apply_non_blank_embedding: true # warning: had better use a well trained model as init model +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_u2++_transformer.yaml b/examples/aishell/s0/conf/train_u2++_transformer.yaml index 44b4d4be7..47c822637 100644 --- a/examples/aishell/s0/conf/train_u2++_transformer.yaml +++ b/examples/aishell/s0/conf/train_u2++_transformer.yaml @@ -26,13 +26,38 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false reverse_weight: 0.3 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_unified_conformer.yaml b/examples/aishell/s0/conf/train_unified_conformer.yaml index 978d3d91c..27a060e97 100644 --- a/examples/aishell/s0/conf/train_unified_conformer.yaml +++ b/examples/aishell/s0/conf/train_unified_conformer.yaml @@ -32,12 +32,37 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_unified_conformer_ctl.yaml b/examples/aishell/s0/conf/train_unified_conformer_ctl.yaml index 8cf2b726d..ad49c28c5 100644 --- a/examples/aishell/s0/conf/train_unified_conformer_ctl.yaml +++ b/examples/aishell/s0/conf/train_unified_conformer_ctl.yaml @@ -32,6 +32,29 @@ decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention model: ctl_model model_conf: @@ -42,6 +65,7 @@ model_conf: logit_temp: 0.4 n_negatives: 100 +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/conf/train_unified_transformer.yaml b/examples/aishell/s0/conf/train_unified_transformer.yaml index 9d7a38687..58506e50f 100644 --- a/examples/aishell/s0/conf/train_unified_transformer.yaml +++ b/examples/aishell/s0/conf/train_unified_transformer.yaml @@ -25,12 +25,37 @@ 
decoder_conf: self_attention_dropout_rate: 0.0 src_attention_dropout_rate: 0.0 +tokenizer: char +tokenizer_conf: + symbol_table_path: 'data/dict/lang_char.txt' + split_with_space: false + bpe_path: null + non_lang_syms_path: null + is_multilingual: false + num_languages: 1 + special_tokens: + : 0 + : 1 + : 2 + : 2 + +ctc: ctc +ctc_conf: + ctc_blank_id: 0 + +cmvn: global_cmvn +cmvn_conf: + cmvn_file: 'data/train/global_cmvn' + is_json_cmvn: true + # hybrid CTC/attention +model: asr_model model_conf: ctc_weight: 0.3 lsm_weight: 0.1 # label smoothing option length_normalized_loss: false +dataset: asr dataset_conf: filter_conf: max_length: 40960 diff --git a/examples/aishell/s0/run.sh b/examples/aishell/s0/run.sh index a94ea7b27..44decb368 100644 --- a/examples/aishell/s0/run.sh +++ b/examples/aishell/s0/run.sh @@ -51,7 +51,6 @@ train_set=train # trained model, and freeze encoder module, otherwise there will be a # autograd error train_config=conf/train_conformer.yaml -cmvn=true dir=exp/conformer tensorboard_dir=tensorboard checkpoint= @@ -104,11 +103,10 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then mkdir -p $(dirname $dict) echo " 0" > ${dict} # 0 is for "blank" in CTC echo " 1" >> ${dict} # must be 1 + echo " 2" >> $dict tools/text2token.py -s 1 -n 1 data/train/text | cut -f 2- -d" " \ | tr " " "\n" | sort | uniq | grep -a -v -e '^\s*$' | \ - awk '{print $0 " " NR+1}' >> ${dict} - num_token=$(cat $dict | wc -l) - echo " $num_token" >> $dict + awk '{print $0 " " NR+2}' >> ${dict} fi if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then @@ -132,9 +130,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then # NOTE(xcsong): deepspeed fails with gloo, see # https://github.com/microsoft/DeepSpeed/issues/2818 dist_backend="nccl" - cmvn_opts= - $cmvn && cp data/${train_set}/global_cmvn $dir - $cmvn && cmvn_opts="--cmvn ${dir}/global_cmvn" # train.py rewrite $train_config to $dir/train.yaml with model input # and output dimension, and $dir/train.yaml will be used for inference @@ -168,7 +163,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --train_engine ${train_engine} \ --config $train_config \ --data_type $data_type \ - --symbol_table ${dict} \ --train_data data/$train_set/data.list \ --cv_data data/dev/data.list \ ${checkpoint:+--checkpoint $checkpoint} \ @@ -177,7 +171,6 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then --ddp.dist_backend $dist_backend \ --num_workers ${num_workers} \ --prefetch ${prefetch} \ - $cmvn_opts \ --pin_memory \ --deepspeed_config ${deepspeed_config} \ --deepspeed.save_states ${deepspeed_save_states} @@ -209,7 +202,6 @@ if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then --beam_size 10 \ --batch_size 32 \ --penalty 0.0 \ - --dict $dict \ --ctc_weight $ctc_weight \ --reverse_weight $reverse_weight \ --result_dir $dir \ diff --git a/test/resources/global_cmvn b/test/resources/global_cmvn new file mode 100644 index 000000000..bf739e005 --- /dev/null +++ b/test/resources/global_cmvn @@ -0,0 +1 @@ +{"mean_stat": [533748832.0, 537379392.0, 553560256.0, 587163840.0, 631869248.0, 662597952.0, 684377536.0, 695391808.0, 692469312.0, 679435968.0, 666124288.0, 656322880.0, 665750272.0, 678694016.0, 681919872.0, 679622592.0, 669891392.0, 656596352.0, 653837696.0, 637679360.0, 628411776.0, 644834944.0, 638840384.0, 646180416.0, 639723136.0, 642757056.0, 637471488.0, 642369216.0, 643415296.0, 647383872.0, 649347712.0, 649295168.0, 650232704.0, 654485632.0, 660473088.0, 667416512.0, 673157760.0, 675672960.0, 675124608.0, 668018432.0, 670061312.0, 
662625152.0, 663145152.0, 662503872.0, 666412736.0, 672262144.0, 678483264.0, 685387712.0, 692571648.0, 699067008.0, 700786048.0, 701201856.0, 702666816.0, 705442496.0, 706070272.0, 705989312.0, 702844352.0, 699318592.0, 696090432.0, 687558528.0, 675279680.0, 663676480.0, 662962688.0, 664300608.0, 666095936.0, 671681664.0, 676652800.0, 680098048.0, 683809344.0, 688702016.0, 692081536.0, 695787328.0, 701086080.0, 706389504.0, 711492544.0, 717638656.0, 719691584.0, 715811904.0, 696363712.0, 604650304.0], "var_stat": [5413303296.0, 5559859712.0, 6150998016.0, 6921247232.0, 7999772672.0, 8789866496.0, 9405788160.0, 9768055808.0, 9759781888.0, 9430648832.0, 9090540544.0, 8873153536.0, 9155926016.0, 9542536192.0, 9653545984.0, 9593427968.0, 9316633600.0, 8959290368.0, 8863544320.0, 8450601472.0, 8211594240.0, 8587095040.0, 8432593920.0, 8583958528.0, 8401722368.0, 8439351296.0, 8293776896.0, 8401498112.0, 8427502592.0, 8525175296.0, 8577086464.0, 8575106560.0, 8594980864.0, 8701697024.0, 8854953984.0, 9029480448.0, 9168768000.0, 9221465088.0, 9194510336.0, 8997099520.0, 9024593920.0, 8819391488.0, 8807892992.0, 8777240576.0, 8869681152.0, 9017400320.0, 9173416960.0, 9345583104.0, 9530641408.0, 9701219328.0, 9749004288.0, 9762750464.0, 9802030080.0, 9874440192.0, 9883308032.0, 9873497088.0, 9780661248.0, 9672590336.0, 9569466368.0, 9321841664.0, 8968135680.0, 8646355968.0, 8616979456.0, 8648616960.0, 8702088192.0, 8859219968.0, 8999392256.0, 9105938432.0, 9220412416.0, 9358609408.0, 9451415552.0, 9552719872.0, 9695440896.0, 9836685312.0, 9970956288.0, 10135898112.0, 10189419520.0, 10070478848.0, 9532954624.0, 7261243904.0], "frame_num": 54068199} \ No newline at end of file diff --git a/test/wenet/dataset/test_processor.py b/test/wenet/dataset/test_processor.py index abe6adeba..c48a17729 100644 --- a/test/wenet/dataset/test_processor.py +++ b/test/wenet/dataset/test_processor.py @@ -1,6 +1,6 @@ import pytest -import wenet.dataset.processor as processor +from wenet.dataset import processor from wenet.utils.init_tokenizer import init_tokenizer @@ -29,6 +29,11 @@ def test_tokenize(symbol_table_path): }, { "txt": "It's okay" }] + configs = {} + configs['tokenizer_conf'] = {} + configs['tokenizer_conf']['symbol_table_path'] = symbol_table_path + configs['tokenizer_conf']['non_lang_syms_path'] = None + configs['tokenizer_conf']['split_with_space'] = False if symbol_table_path == "test/resources/librispeech.words.txt": bpe_model = "test/resources/librispeech.train_960_unigram5000.bpemodel" refs = [{ @@ -67,6 +72,8 @@ def test_tokenize(symbol_table_path): "tokens": ['▁IT', "'", 'S', '▁O', 'KA', 'Y'], "label": [2344, 2, 3790, 3010, 2418, 4979] }] + configs['tokenizer'] = 'bpe' + configs['tokenizer_conf']['bpe_path'] = bpe_model else: bpe_model = None refs = [{ @@ -138,9 +145,9 @@ def test_tokenize(symbol_table_path): "tokens": ['I', 't', "'", 's', '▁', 'o', 'k', 'a', 'y'], "label": [24, 46, 2, 43, 1, 35, 27, 7, 56] }] + configs['tokenizer'] = 'char' - configs = {'split_with_space': False} - tokenizer = init_tokenizer(configs, symbol_table_path, bpe_model) + tokenizer = init_tokenizer(configs) outs = processor.tokenize(txts, tokenizer) for (hyp, ref) in zip(outs, refs): assert (len(hyp["tokens"]) == len(ref["tokens"])) diff --git a/test/wenet/text/test_paraformer_tokenizer.py b/test/wenet/text/test_paraformer_tokenizer.py index 5101fe693..b581955be 100644 --- a/test/wenet/text/test_paraformer_tokenizer.py +++ b/test/wenet/text/test_paraformer_tokenizer.py @@ -28,7 +28,7 @@ def 
paraformer_tokenizer(request): configs['tokenizer_conf']['symbol_table_path'] = wenet_units configs['tokenizer_conf']['seg_dict_path'] = os.path.join( download_root, seg_dict) - return init_tokenizer(configs, wenet_units) + return init_tokenizer(configs) def test_tokenize(paraformer_tokenizer): diff --git a/test/wenet/utils/test_init_model.py b/test/wenet/utils/test_init_model.py index f73e181ec..e16ec7455 100644 --- a/test/wenet/utils/test_init_model.py +++ b/test/wenet/utils/test_init_model.py @@ -30,5 +30,30 @@ def test_init_model(): config['input_dim'] = input_dim # TODO(xcsong): fix vocab_size config['output_dim'] = 3000 + if config.get('cmvn', None) == "global_cmvn": + config['cmvn_conf']['cmvn_file'] = "test/resources/global_cmvn" + if 'tokenizer' in config: + if config['tokenizer'] == "char": + config['tokenizer_conf'][ + 'symbol_table_path'] = "test/resources/aishell2.words.txt" + elif config['tokenizer'] == "bpe": + config['tokenizer_conf']['bpe_path'] = \ + "test/resources/librispeech.train_960_unigram5000.bpemodel" + config['tokenizer_conf']['symbol_table_path'] = \ + "test/resources/librispeech.words.txt" + config['tokenizer_conf']['non_lang_syms_path'] = \ + "test/resources/non-linguistic-symbols.invalid" + elif config['tokenizer'] == "whisper": + config['tokenizer_conf']['is_multilingual'] = True + config['tokenizer_conf']['num_languages'] = 100 + else: + raise NotImplementedError + else: + config['tokenizer'] = "char" + config['tokenizer_conf'] = {} + config['tokenizer_conf']['symbol_table_path'] = \ + "test/resources/aishell2.words.txt" + config['tokenizer_conf']['non_lang_syms_path'] = \ + "test/resources/non-linguistic-symbols.invalid" print("checking {} {}".format(c, config)) init_model(args, config) diff --git a/test/wenet/utils/test_init_tokenizer.py b/test/wenet/utils/test_init_tokenizer.py index 1b5b4af94..feef99802 100644 --- a/test/wenet/utils/test_init_tokenizer.py +++ b/test/wenet/utils/test_init_tokenizer.py @@ -6,12 +6,13 @@ def test_init_whisper_tokenizer(): # TODO(Mddct): add configs generator configs = {} - configs['whisper'] = True - configs['whisper_conf'] = {} - configs['whisper_conf']['is_multilingual'] = False - configs['whisper_conf']['num_languages'] = 99 + configs['tokenizer'] = 'whisper' + configs['tokenizer_conf'] = {} + configs['tokenizer_conf']['symbol_table_path'] = None + configs['tokenizer_conf']['is_multilingual'] = False + configs['tokenizer_conf']['num_languages'] = 99 - tokenizer = init_tokenizer(configs, None) + tokenizer = init_tokenizer(configs) text = "whisper powered by wenet, it's great" assert text == tokenizer.tokens2text(tokenizer.text2tokens(text)) @@ -22,7 +23,11 @@ def test_init_whisper_tokenizer(): ]) def test_init_char_tokenizer(symbol_table_path): configs = {} - tokenizer = init_tokenizer(configs, symbol_table_path) + configs['tokenizer'] = 'char' + configs['tokenizer_conf'] = {} + configs['tokenizer_conf']['symbol_table_path'] = symbol_table_path + configs['tokenizer_conf']['non_lang_syms_path'] = None + tokenizer = init_tokenizer(configs) text = "大家都好帅" assert text == tokenizer.tokens2text(tokenizer.text2tokens(text)) @@ -35,7 +40,12 @@ def test_init_char_tokenizer(symbol_table_path): def test_init_bpe_tokenizer(symbol_table_path, bpe_model): configs = {} - tokenizer = init_tokenizer(configs, symbol_table_path, bpe_model) + configs['tokenizer'] = 'bpe' + configs['tokenizer_conf'] = {} + configs['tokenizer_conf']['bpe_path'] = bpe_model + configs['tokenizer_conf']['symbol_table_path'] = symbol_table_path + 
configs['tokenizer_conf']['non_lang_syms_path'] = None + tokenizer = init_tokenizer(configs) text = "WENET IT'S GREAT" assert text == tokenizer.tokens2text(tokenizer.text2tokens(text)) diff --git a/tools/onnx2horizonbin.py b/tools/onnx2horizonbin.py index 3f6474572..96bc4061c 100755 --- a/tools/onnx2horizonbin.py +++ b/tools/onnx2horizonbin.py @@ -49,7 +49,6 @@ from wenet.utils.common import remove_duplicates_and_blank from wenet.dataset.dataset import Dataset from wenet.utils.checkpoint import load_checkpoint -from wenet.utils.file_utils import read_symbol_table from wenet.utils.init_model import init_model from wenet.utils.init_tokenizer import init_tokenizer from wenet.bin.export_onnx_cpu import to_numpy @@ -76,10 +75,9 @@ def save_data(tensor, dirs, prefix): data.tofile(dirs + "/" + prefix + ".bin") -def make_calibration_data(enc, args, conf): +def make_calibration_data(enc, args, conf, tokenizer): conf['shuffle'] = True logger.info(conf) - tokenizer = init_tokenizer(ali_conf, args.symbol_table, args.bpe_model) dataset = Dataset("shard", args.cali_datalist, tokenizer, @@ -151,16 +149,15 @@ def make_calibration_data(enc, args, conf): prefix + "." + str(i)) -def check_wer(enc, ctc, args, conf): +def check_wer(enc, ctc, args, conf, tokenizer): conf['shuffle'] = False - tokenizer = init_tokenizer(ali_conf, args.symbol_table, args.bpe_model) dataset = Dataset("shard", args.wer_datalist, tokenizer, conf, partition=False) dataloader = DataLoader(dataset, batch_size=None, num_workers=0) - char_dict = {v: k for k, v in args.symbol_table.items()} + char_dict = {v: k for k, v in tokenizer.symbol_table.items()} eos = len(char_dict) - 1 enc_session = HB_ONNXRuntime( @@ -375,7 +372,6 @@ def get_args(): default=0.5, type=float, help='reverse_weight in attention_rescoing') - parser.add_argument('--dict', type=str, required=True, help='dict file') parser.add_argument('--max_samples', type=int, required=True, @@ -389,10 +385,6 @@ def get_args(): default=None, help='check wer') parser.add_argument('--wer_text', type=str, default=None, help='check wer') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') parser.add_argument('--ln_run_on_bpu', action='store_true', help='layernorm running on bpu') @@ -418,15 +410,14 @@ def get_args(): os.environ['CUDA_VISIBLE_DEVICES'] = '-1' with open(args.config, 'r') as fin: - conf = yaml.load(fin, Loader=yaml.FullLoader) + configs = yaml.load(fin, Loader=yaml.FullLoader) - model = init_model(conf) + model = init_model(args, configs) load_checkpoint(model, args.checkpoint) + tokenizer = init_tokenizer(configs) model.eval() - symbol_table = read_symbol_table(args.dict) - args.symbol_table = symbol_table - args.feature_size = conf['input_dim'] + args.feature_size = configs['input_dim'] args.output_size = model.encoder.output_size() args.decoding_window = (args.chunk_size - 1) * \ model.encoder.embed.subsampling_rate + \ @@ -436,7 +427,7 @@ def get_args(): enc, enc_session = export_encoder(model, args) ctc, ctc_session = export_ctc(model, args) - conf = copy.deepcopy(conf['dataset_conf']) + conf = copy.deepcopy(configs['dataset_conf']) conf['filter_conf']['max_length'] = 102400 conf['filter_conf']['min_length'] = 0 conf['filter_conf']['token_max_length'] = 102400 @@ -483,7 +474,7 @@ def get_args(): generate_config(enc_session, ctc_session, args) logger.info("Stage-3: Make calibration data") - make_calibration_data(enc, args, conf) + make_calibration_data(enc, args, conf, tokenizer) output_dir = 
os.path.realpath(args.output_dir) logger.info("Stage-4: Make ctc.bin") @@ -502,7 +493,7 @@ def get_args(): logger.info( "Stage-6: Check wer between torch model and quantized onnx") assert args.wer_text is not None - check_wer(enc, ctc, args, conf) + check_wer(enc, ctc, args, conf, tokenizer) os.system( "python3 tools/compute-wer.py --char=1 --v=1 {} {} > {}".format( args.wer_text, args.output_dir + "/torch_text", diff --git a/wenet/bin/alignment.py b/wenet/bin/alignment.py index 1d510a9d8..2bfda7bcb 100644 --- a/wenet/bin/alignment.py +++ b/wenet/bin/alignment.py @@ -201,8 +201,7 @@ def get_labformat(timestamp, subsample): ali_conf['batch_conf']['batch_type'] = "static" ali_conf['batch_conf']['batch_size'] = args.batch_size - tokenizer = init_tokenizer(ali_conf, args.dict, args.bpe_model, - args.non_lang_syms) + tokenizer = init_tokenizer(configs) ali_dataset = Dataset(args.data_type, args.input_file, tokenizer, diff --git a/wenet/bin/export_onnx_gpu.py b/wenet/bin/export_onnx_gpu.py index 9832519ee..9b4620802 100644 --- a/wenet/bin/export_onnx_gpu.py +++ b/wenet/bin/export_onnx_gpu.py @@ -1195,7 +1195,13 @@ def export_rescoring_decoder(model, configs, args, logger, decoder_onnx_path, with open(args.config, "r") as fin: configs = yaml.load(fin, Loader=yaml.FullLoader) if args.cmvn_file and os.path.exists(args.cmvn_file): - configs["cmvn_file"] = args.cmvn_file + if 'cmvn' not in configs: + configs['cmvn'] = "global_cmvn" + configs['cmvn_conf'] = {} + else: + assert configs['cmvn'] == "global_cmvn" + assert configs['cmvn']['cmvn_conf'] is not None + configs['cmvn_conf']["cmvn_file"] = args.cmvn_file if (args.reverse_weight != -1.0 and "reverse_weight" in configs["model_conf"]): configs["model_conf"]["reverse_weight"] = args.reverse_weight diff --git a/wenet/bin/recognize.py b/wenet/bin/recognize.py index 699ecd1df..7a7d800fa 100644 --- a/wenet/bin/recognize.py +++ b/wenet/bin/recognize.py @@ -44,10 +44,6 @@ def get_args(): default=-1, help='gpu id for this rank, -1 for cpu') parser.add_argument('--checkpoint', required=True, help='checkpoint model') - parser.add_argument('--dict', required=True, help='dict file') - parser.add_argument( - "--non_lang_syms", - help="non-linguistic symbol file. 
One symbol per line.") parser.add_argument('--beam_size', type=int, default=10, @@ -121,18 +117,10 @@ def get_args(): default=0.0, help='''right to left weight for attention rescoring decode mode''') - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') parser.add_argument('--override_config', action='append', default=[], help="override yaml config") - parser.add_argument('--connect_symbol', - default='', - type=str, - help='used to connect the output characters') parser.add_argument('--word', default='', @@ -207,8 +195,7 @@ def main(): test_conf['batch_conf']['batch_type'] = "static" test_conf['batch_conf']['batch_size'] = args.batch_size - tokenizer = init_tokenizer(configs, args.dict, args.bpe_model, - args.non_lang_syms) + tokenizer = init_tokenizer(configs) test_dataset = Dataset(args.data_type, args.test_data, tokenizer, @@ -229,7 +216,8 @@ def main(): context_graph = None if 'decoding-graph' in args.context_bias_mode: context_graph = ContextGraph(args.context_list_path, - tokenizer.symbol_table, args.bpe_model, + tokenizer.symbol_table, + configs['tokenizer_conf']['bpe_path'], args.context_graph_score) _, blank_id = get_blank_id(configs, tokenizer.symbol_table) diff --git a/wenet/bin/recognize_onnx_gpu.py b/wenet/bin/recognize_onnx_gpu.py index 5a5cea66c..3fb0d8bbb 100644 --- a/wenet/bin/recognize_onnx_gpu.py +++ b/wenet/bin/recognize_onnx_gpu.py @@ -139,7 +139,7 @@ def main(): test_conf['batch_conf']['batch_type'] = "static" test_conf['batch_conf']['batch_size'] = args.batch_size - tokenizer = init_tokenizer(test_conf, args.dict, args.bpe_model) + tokenizer = init_tokenizer(configs) test_dataset = Dataset(args.data_type, args.test_data, tokenizer, diff --git a/wenet/bin/train.py b/wenet/bin/train.py index 2cf70a8b8..f6c3e5c4a 100644 --- a/wenet/bin/train.py +++ b/wenet/bin/train.py @@ -73,8 +73,7 @@ def main(): configs = override_config(configs, args.override_config) # init tokenizer - tokenizer = init_tokenizer(configs, args.symbol_table, args.bpe_model, - args.non_lang_syms) + tokenizer = init_tokenizer(configs) # Init env for ddp OR deepspeed world_size, local_rank, rank = init_distributed(args) diff --git a/wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py b/wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py index 587a3c077..9379d4690 100644 --- a/wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py +++ b/wenet/paraformer/convert_paraformer_to_wenet_config_and_ckpt.py @@ -269,9 +269,10 @@ def main(): configs, seg_dict) configs['output_dim'] = vocab_size configs['model'] = 'paraformer' - configs['is_json_cmvn'] = True - configs['cmvn_file'] = json_cmvn_path - # configs['input_dim'] = 80 + configs['cmvn'] = "global_cmvn" + configs['cmvn_conf'] = {} + configs['cmvn_conf']['is_json_cmvn'] = True + configs['cmvn_conf']['cmvn_file'] = json_cmvn_path fields_to_keep = [ 'model', 'encoder_conf', 'decoder_conf', 'predictor_conf', 'input_dim', 'output_dim', 'cmvn_file', 'is_json_cmvn', 'model_conf', 'paraformer', diff --git a/wenet/utils/init_model.py b/wenet/utils/init_model.py index 5e6680fc1..4b116bb57 100644 --- a/wenet/utils/init_model.py +++ b/wenet/utils/init_model.py @@ -82,8 +82,10 @@ def init_model(args, configs): - if configs.get('cmvn_file', None) is not None: - mean, istd = load_cmvn(configs['cmvn_file'], configs['is_json_cmvn']) + # TODO(xcsong): Forcefully read the 'cmvn' attribute. 
+ if configs.get('cmvn', None) == 'global_cmvn': + mean, istd = load_cmvn(configs['cmvn_conf']['cmvn_file'], + configs['cmvn_conf']['is_json_cmvn']) global_cmvn = GlobalCMVN( torch.from_numpy(mean).float(), torch.from_numpy(istd).float()) diff --git a/wenet/utils/init_tokenizer.py b/wenet/utils/init_tokenizer.py index f16ae0493..2ce8fdd91 100644 --- a/wenet/utils/init_tokenizer.py +++ b/wenet/utils/init_tokenizer.py @@ -1,3 +1,20 @@ +# Copyright (c) 2023 Wenet Community. (authors: Dinghao Zhou) +# (authors: Xingchen Song) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import logging + from wenet.text.base_tokenizer import BaseTokenizer from wenet.text.bpe_tokenizer import BpeTokenizer from wenet.text.char_tokenizer import CharTokenizer @@ -5,30 +22,32 @@ from wenet.text.whisper_tokenizer import WhisperTokenizer -def init_tokenizer(configs, - symbol_table, - bpe_model=None, - non_lang_syms=None) -> BaseTokenizer: - if configs.get("whisper", False): +def init_tokenizer(configs) -> BaseTokenizer: + # TODO(xcsong): Forcefully read the 'tokenizer' attribute. + tokenizer_type = configs.get("tokenizer", "char") + if tokenizer_type == "whisper": tokenizer = WhisperTokenizer( - multilingual=configs['whisper_conf']['is_multilingual'], - num_languages=configs['whisper_conf']['num_languages']) - elif configs.get("tokenizer", "char") == 'paraformer': - assert 'tokenizer' in configs - assert 'tokenizer_conf' in configs - assert symbol_table == configs['tokenizer_conf']['symbol_table_path'] - return ParaformerTokenizer( + multilingual=configs['tokenizer_conf']['is_multilingual'], + num_languages=configs['tokenizer_conf']['num_languages']) + elif tokenizer_type == "char": + tokenizer = CharTokenizer( + configs['tokenizer_conf']['symbol_table_path'], + configs['tokenizer_conf']['non_lang_syms_path'], + split_with_space=configs['tokenizer_conf'].get( + 'split_with_space', False)) + elif tokenizer_type == "bpe": + tokenizer = BpeTokenizer( + configs['tokenizer_conf']['bpe_path'], + configs['tokenizer_conf']['symbol_table_path'], + configs['tokenizer_conf']['non_lang_syms_path'], + split_with_space=configs['tokenizer_conf'].get( + 'split_with_space', False)) + elif tokenizer_type == 'paraformer': + tokenizer = ParaformerTokenizer( symbol_table=configs['tokenizer_conf']['symbol_table_path'], seg_dict=configs['tokenizer_conf']['seg_dict_path']) - elif bpe_model is None: - tokenizer = CharTokenizer(symbol_table, - non_lang_syms, - split_with_space=configs.get( - 'split_with_space', False)) else: - tokenizer = BpeTokenizer(bpe_model, - symbol_table, - split_with_space=configs.get( - 'split_with_space', False)) + raise NotImplementedError + logging.info("use {} tokenizer".format(configs["tokenizer"])) return tokenizer diff --git a/wenet/utils/train_utils.py b/wenet/utils/train_utils.py index e8ab61197..465eb9f28 100644 --- a/wenet/utils/train_utils.py +++ b/wenet/utils/train_utils.py @@ -47,17 +47,6 @@ def add_model_args(parser): parser.add_argument('--tensorboard_dir', default='tensorboard', help='tensorboard log 
dir') - parser.add_argument('--cmvn', default=None, help='global cmvn file') - parser.add_argument('--symbol_table', - required=True, - help='model unit symbol table for training') - parser.add_argument( - "--non_lang_syms", - help="non-linguistic symbol file. One symbol per line.") - parser.add_argument('--bpe_model', - default=None, - type=str, - help='bpe model for english part') parser.add_argument('--override_config', action='append', default=[], @@ -78,10 +67,6 @@ def add_model_args(parser): type=lambda s: [str(mod) for mod in s.split(",") if s != ""], help='free module names', ) - parser.add_argument('--lfmmi_dir', - default='', - required=False, - help='LF-MMI dir') return parser @@ -229,9 +214,6 @@ def check_modify_and_save_config(args, configs, symbol_table): configs['input_dim'] = input_dim configs['output_dim'] = configs['vocab_size'] - configs['cmvn_file'] = args.cmvn - configs['is_json_cmvn'] = True - configs['lfmmi_dir'] = args.lfmmi_dir configs['train_engine'] = args.train_engine configs['use_amp'] = args.use_amp diff --git a/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py b/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py index 4a51fb9b5..932a0d63a 100644 --- a/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py +++ b/wenet/whisper/convert_whisper_to_wenet_config_and_ckpt.py @@ -114,12 +114,18 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str): configs['ctc_conf'] = {} configs['ctc_conf']['ctc_blank_id'] = tokenizer.no_speech + configs['cmvn'] = None + configs['cmvn_conf'] = {} + configs['cmvn_conf']['cmvn_file'] = None + configs['cmvn_conf']['is_json_cmvn'] = None + configs['model'] = "whisper" configs['model_conf'] = {} configs['model_conf']['ctc_weight'] = 0.3 configs['model_conf']['lsm_weight'] = 0.1 configs['model_conf']['length_normalized_loss'] = False + configs['dataset'] = "asr" configs['dataset_conf'] = {} configs['dataset_conf']['filter_conf'] = {} configs['dataset_conf']['filter_conf'][ @@ -138,7 +144,10 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str): configs['dataset_conf']['spec_aug_conf']['num_f_mask'] = 2 configs['dataset_conf']['spec_aug_conf']['max_t'] = 50 configs['dataset_conf']['spec_aug_conf']['max_f'] = 10 - configs['dataset_conf']['spec_sub'] = False + configs['dataset_conf']['spec_sub'] = True + configs['dataset_conf']['spec_sub_conf'] = {} + configs['dataset_conf']['spec_sub_conf']['num_t_sub'] = 3 + configs['dataset_conf']['spec_sub_conf']['max_t'] = 30 configs['dataset_conf']['spec_trim'] = False configs['dataset_conf']['shuffle'] = True configs['dataset_conf']['shuffle_conf'] = {} @@ -159,16 +168,16 @@ def convert_to_wenet_yaml(tokenizer, dims, wenet_yaml_path: str): configs['dataset_conf']['batch_conf']['max_frames_in_batch'] = 12000 configs['grad_clip'] = 5 - configs['accum_grad'] = 1 + configs['accum_grad'] = 4 configs['max_epoch'] = 100 configs['log_interval'] = 100 configs['optim'] = "adam" configs['optim_conf'] = {} - configs['optim_conf']['lr'] = 0.002 + configs['optim_conf']['lr'] = 0.0005 configs['scheduler'] = "warmuplr" configs['scheduler_conf'] = {} - configs['scheduler_conf']['warmup_steps'] = 25000 + configs['scheduler_conf']['warmup_steps'] = 12000 with open(wenet_yaml_path, '+w') as f: f.write(yaml.dump(configs))
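
For reference, a minimal sketch of how a downstream script consumes the new self-contained train.yaml after this refactor: the tokenizer/ctc/cmvn sections now live in the config itself, so no --symbol_table, --bpe_model, --non_lang_syms or --cmvn command-line flags are needed. This is an illustrative assumption of typical usage, not part of the patch; the config path below is hypothetical, and only calls that appear in the patch (yaml.load with FullLoader, init_tokenizer(configs), tokenizer.symbol_table) are used.

    import yaml

    from wenet.utils.init_tokenizer import init_tokenizer

    # Hypothetical path to a config written in the new layout shown above.
    with open('exp/conformer/train.yaml', 'r') as fin:
        configs = yaml.load(fin, Loader=yaml.FullLoader)

    # The tokenizer is built purely from the 'tokenizer'/'tokenizer_conf'
    # sections of the yaml; there are no extra positional arguments anymore.
    tokenizer = init_tokenizer(configs)
    print(len(tokenizer.symbol_table))  # vocabulary size derived from the yaml

The same pattern holds for the model side: init_model(args, configs) reads the 'cmvn'/'cmvn_conf' and 'ctc'/'ctc_conf' sections from the loaded config, which is why the recipes above drop cmvn_opts and --dict from run.sh.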