# File-viewer metadata captured with this config (not a setting): 64 lines, 1.1 KiB, YAML
# network architecture
# Selects the model type; its options are nested under `model_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
model: Transducer
model_conf:
  # Presumably 0.0 turns the auxiliary CTC term off — TODO confirm.
  auxiliary_ctc_weight: 0.0

# encoder
# Selects the encoder implementation; its options are nested under
# `encoder_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
encoder: RWKVEncoder
encoder_conf:
  kernel: 3
  subsampling_factor: 4
  output_size: 512
  num_blocks: 18
  time_reduction_factor: 2
  # Three dropout settings, all 0.1 here. The `att_`/`ffn_` prefixes
  # presumably mean attention and feed-forward — TODO confirm.
  att_dropout_rate: 0.1
  ffn_dropout_rate: 0.1
  # Must stay nested: the decoder settings use a key with the same name,
  # so a top-level `dropout_rate` would be a duplicate key.
  dropout_rate: 0.1

# decoder (prediction network)
# Selects the decoder implementation; its options are nested under
# `decoder_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
decoder: rnnt_decoder
decoder_conf:
  embed_size: 512
  hidden_size: 512
  embed_dropout_rate: 0.1
  # Must stay nested: the encoder settings use a key with the same name,
  # so a top-level `dropout_rate` would be a duplicate key.
  dropout_rate: 0.1
  use_embed_mask: false

# joint network
# Selects the joint network; its options are nested under
# `joint_network_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
joint_network: joint_network
joint_network_conf:
  # Same value (512) as the encoder `output_size` and the decoder
  # `hidden_size` in this file.
  joint_space_size: 512

# frontend
# Selects the audio frontend; its options are nested under `frontend_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
frontend: WavFrontend
frontend_conf:
  # Presumably the sample rate in Hz — TODO confirm.
  fs: 16000
  window: hamming
  n_mels: 80
  # Presumably milliseconds — TODO confirm units against the frontend.
  frame_length: 25
  frame_shift: 10
  lfr_m: 1
  lfr_n: 1
  # NOTE(review): the key looks like a misspelling of `upscale_samples`.
  # Left unchanged because the consumer may expect this exact name —
  # verify against the frontend's parameters before renaming.
  upsacle_samples: true

# spec augmentation
# Selects the augmentation module; its options are nested under
# `specaug_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
specaug: SpecAugLFR
specaug_conf:
  # Time warp is switched off. The window/mode values below presumably
  # only matter if it is enabled — TODO confirm.
  apply_time_warp: false
  time_warp_window: 5
  time_warp_mode: bicubic
  apply_freq_mask: true
  # Two-element range, written as a sequence.
  freq_mask_width_range:
    - 0
    - 30
  # NOTE(review): `lfr_rate` is 6 while the frontend section sets
  # `lfr_m` and `lfr_n` to 1 — confirm this combination is intended.
  lfr_rate: 6
  num_freq_mask: 1
  apply_time_mask: true
  # Two-element range, written as a sequence.
  time_mask_width_range:
    - 0
    - 12
  num_time_mask: 1

# tokenizer
# Selects the tokenizer; its options are nested under `tokenizer_conf`.
# NOTE(review): nesting was reconstructed from a flattened copy of this
# file — confirm against the config loader.
tokenizer: CharTokenizer
tokenizer_conf:
  # Plain scalar is safe here: `<` is not a YAML indicator character.
  unk_symbol: <unk>
  split_with_space: true