using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using AliFsmnVadSharp.Model;

namespace AliFsmnVadSharp
{
    /// <summary>
    /// States of the endpoint-detection state machine kept by <see cref="E2EVadModel"/>.
    /// </summary>
    enum VadStateMachine
    {
        kVadInStateStartPointNotDetected = 1,
        kVadInStateInSpeechSegment = 2,
        kVadInStateEndPointDetected = 3,
    }

    /// <summary>
    /// Detection modes compared against <c>VadPostConfEntity.detect_mode</c>.
    /// In the multiple-utterance mode the per-utterance state is reset as soon as an end point is detected.
    /// </summary>
    enum VadDetectMode
    {
        kVadSingleUtteranceDetectMode = 0,
        kVadMutipleUtteranceDetectMode = 1,
    }

    /// <summary>
    /// Frame-level voice activity post-processor: combines per-frame model scores with
    /// per-frame signal energy, feeds the resulting speech/silence decision through a
    /// <see cref="WindowDetector"/>, and turns the detected transitions into speech
    /// segments expressed in milliseconds.
    /// </summary>
    internal class E2EVadModel
    {
        private readonly VadPostConfEntity _vad_opts = new VadPostConfEntity();
        private readonly WindowDetector _windows_detector = new WindowDetector();

        private bool _is_final = false;
        private int _data_buf_start_frame = 0;
        private int _frm_cnt = 0;
        private int _latest_confirmed_speech_frame = 0;
        private int _lastest_confirmed_silence_frame = -1;
        private int _continous_silence_frame_count = 0;
        private VadStateMachine _vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected;
        private int _confirmed_start_frame = -1;
        private int _confirmed_end_frame = -1;
        private int _number_end_time_detected = 0;
        private int _sil_frame = 0;
        private int[] _sil_pdf_ids = new int[0];
        private double _noise_average_decibel = -100.0D;
        private bool _pre_end_silence_detected = false;
        private bool _next_seg = true;

        // Segments produced so far; entries before _output_data_buf_offset were already returned.
        private List<E2EVadSpeechBufWithDoaEntity> _output_data_buf;
        private int _output_data_buf_offset = 0;
        private List<E2EVadFrameProbEntity> _frame_probs = new List<E2EVadFrameProbEntity>();
        private int _max_end_sil_frame_cnt_thresh = 800 - 150;
        private float _speech_noise_thres = 0.6F;

        // Scores of the chunk currently being processed; frame t of the stream is row
        // (t - _idx_pre_chunk) of this array.
        private float[,,] _scores = null;
        private int _idx_pre_chunk = 0;
        private bool _max_time_out = false;

        // One value per frame, appended by ComputeDecibel.
        private List<double> _decibel = new List<double>();

        // Only sample COUNTS are tracked; no audio samples are stored by this class.
        private int _data_buf_size = 0;
        private int _data_buf_all_size = 0;

        /// <summary>
        /// Creates a detector using the given post-processing options and resets all state.
        /// </summary>
        /// <param name="vadPostConfEntity">Options read throughout detection (frame sizes, thresholds, mode).</param>
        public E2EVadModel(VadPostConfEntity vadPostConfEntity)
        {
            _vad_opts = vadPostConfEntity;
            _windows_detector = new WindowDetector(
                _vad_opts.window_size_ms,
                _vad_opts.sil_to_speech_time_thres,
                _vad_opts.speech_to_sil_time_thres,
                _vad_opts.frame_in_ms);
            AllResetDetection();
        }

        /// <summary>
        /// Resets every piece of state, including the stream-level buffers and counters,
        /// and then performs the per-utterance reset.
        /// </summary>
        private void AllResetDetection()
        {
            _is_final = false;
            _data_buf_start_frame = 0;
            _frm_cnt = 0;
            _number_end_time_detected = 0;
            _sil_pdf_ids = _vad_opts.sil_pdf_ids;
            _noise_average_decibel = -100.0D;
            _pre_end_silence_detected = false;
            _next_seg = true;
            _output_data_buf = new List<E2EVadSpeechBufWithDoaEntity>();
            _output_data_buf_offset = 0;
            _max_end_sil_frame_cnt_thresh = _vad_opts.max_end_silence_time - _vad_opts.speech_to_sil_time_thres;
            _speech_noise_thres = _vad_opts.speech_noise_thres;
            _scores = null;
            _idx_pre_chunk = 0;
            _max_time_out = false;
            _decibel = new List<double>();
            _data_buf_size = 0;
            _data_buf_all_size = 0;

            // The remaining fields (state machine, confirmed frames, silence counters,
            // frame probabilities, window detector) are handled by the per-utterance reset.
            ResetDetection();
        }

        /// <summary>
        /// Resets the per-utterance state so that a new start point can be detected.
        /// </summary>
        private void ResetDetection()
        {
            _continous_silence_frame_count = 0;
            _latest_confirmed_speech_frame = 0;
            _lastest_confirmed_silence_frame = -1;
            _confirmed_start_frame = -1;
            _confirmed_end_frame = -1;
            _vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected;
            _windows_detector.Reset();
            _sil_frame = 0;
            _frame_probs = new List<E2EVadFrameProbEntity>();
        }

        /// <summary>Number of samples covered by one frame shift (<c>frame_in_ms</c>).</summary>
        private int FrameShiftInSamples()
        {
            return (int)(_vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
        }

        /// <summary>
        /// Appends one energy value per frame of <paramref name="waveform"/> to the decibel list
        /// (10 * log10(sum of squared samples + 1e-6)) and updates the buffered sample counts.
        /// </summary>
        /// <param name="waveform">Samples of the current chunk.</param>
        private void ComputeDecibel(float[] waveform)
        {
            int frame_sample_length = (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000);
            int frame_shift_length = FrameShiftInSamples();

            if (_data_buf_all_size == 0)
            {
                _data_buf_all_size = waveform.Length;
                _data_buf_size = _data_buf_all_size;
            }
            else
            {
                _data_buf_all_size += waveform.Length;
            }

            for (int offset = 0; offset < waveform.Length - frame_sample_length + 1; offset += frame_shift_length)
            {
                // Each square is rounded to float, the squares are accumulated in double and the
                // total is narrowed to float before the epsilon is added, so results stay
                // numerically identical to a float-array LINQ Sum() without the per-frame allocations.
                double energy = 0.0D;
                for (int i = 0; i < frame_sample_length; i++)
                {
                    energy += (float)Math.Pow((double)waveform[offset + i], 2);
                }
                _decibel.Add(10 * Math.Log10((float)energy + 0.000001));
            }
        }

        /// <summary>
        /// Stores the scores of the current chunk and advances the total frame count by the
        /// chunk's frame dimension (dimension 1). Also records that size in the options as
        /// <c>nn_eval_block_size</c>.
        /// </summary>
        private void ComputeScores(float[,,] scores)
        {
            int frames_in_chunk = scores.GetLength(1);
            _vad_opts.nn_eval_block_size = frames_in_chunk;
            _frm_cnt += frames_in_chunk;
            _scores = scores;
        }

        /// <summary>
        /// Advances the start of the (virtual) data buffer frame by frame until it reaches
        /// <paramref name="frame_idx"/>, shrinking the remaining sample count accordingly.
        /// </summary>
        private void PopDataBufTillFrame(int frame_idx)
        {
            int frame_shift_length = FrameShiftInSamples();
            while (_data_buf_start_frame < frame_idx)
            {
                if (_data_buf_size < frame_shift_length)
                {
                    // Less than one frame shift of samples is left. Nothing in the loop can
                    // change that, so looping on would never terminate; stop advancing instead.
                    // NOTE(review): _data_buf_size is not refreshed when later chunks arrive
                    // (only _data_buf_all_size grows) - confirm whether that is intended.
                    break;
                }

                _data_buf_start_frame += 1;
                _data_buf_size = _data_buf_all_size - _data_buf_start_frame * frame_shift_length;
            }
        }

        /// <summary>
        /// Extends the current output segment (or opens a new one) by <paramref name="frm_cnt"/>
        /// frames starting at <paramref name="start_frm"/>.
        /// </summary>
        /// <param name="start_frm">First frame to add.</param>
        /// <param name="frm_cnt">Number of frames to add.</param>
        /// <param name="first_frm_is_start_point">Opens a new segment and marks it as containing a start point.</param>
        /// <param name="last_frm_is_end_point">Marks the segment as containing an end point.</param>
        /// <param name="end_point_is_sent_end">When set, the expected sample count is raised to everything still buffered.</param>
        private void PopDataToOutputBuf(int start_frm, int frm_cnt, bool first_frm_is_start_point, bool last_frm_is_end_point, bool end_point_is_sent_end)
        {
            PopDataBufTillFrame(start_frm);

            int expected_sample_number = (int)(frm_cnt * _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000);
            if (last_frm_is_end_point)
            {
                // The last frame spans a full frame length rather than a single frame shift.
                int extra_sample = Math.Max(0, (int)(_vad_opts.frame_length_ms * _vad_opts.sample_rate / 1000 - _vad_opts.sample_rate * _vad_opts.frame_in_ms / 1000));
                expected_sample_number += extra_sample;
            }
            if (end_point_is_sent_end)
            {
                expected_sample_number = Math.Max(expected_sample_number, _data_buf_size);
            }
            if (_data_buf_size < expected_sample_number)
            {
                Console.WriteLine("error in calling pop data_buf\n");
            }

            if (_output_data_buf.Count == 0 || first_frm_is_start_point)
            {
                E2EVadSpeechBufWithDoaEntity new_seg = new E2EVadSpeechBufWithDoaEntity();
                new_seg.Reset();
                new_seg.start_ms = start_frm * _vad_opts.frame_in_ms;
                new_seg.end_ms = new_seg.start_ms;
                new_seg.doa = 0;
                _output_data_buf.Add(new_seg);
            }

            E2EVadSpeechBufWithDoaEntity cur_seg = _output_data_buf[_output_data_buf.Count - 1];
            if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
            {
                Console.WriteLine("warning\n");
            }

            // The segment's sample buffer is currently not written to; only the sample
            // counts are checked so that the diagnostics below are still produced.
            int data_to_pop = end_point_is_sent_end
                ? expected_sample_number
                : (int)(frm_cnt * _vad_opts.frame_in_ms * _vad_opts.sample_rate / 1000);
            if (data_to_pop > _data_buf_size)
            {
                Console.WriteLine("VAD data_to_pop is bigger than _data_buf_size!!!\n");
            }

            cur_seg.doa = 0;
            if (cur_seg.end_ms != start_frm * _vad_opts.frame_in_ms)
            {
                Console.WriteLine("Something wrong with the VAD algorithm\n");
            }

            _data_buf_start_frame += frm_cnt;
            cur_seg.end_ms = (start_frm + frm_cnt) * _vad_opts.frame_in_ms;
            if (first_frm_is_start_point)
            {
                cur_seg.contain_seg_start_point = true;
            }
            if (last_frm_is_end_point)
            {
                cur_seg.contain_seg_end_point = true;
            }
        }

        /// <summary>
        /// Records a confirmed silence frame; while no start point has been detected the
        /// data buffer start is advanced up to that frame.
        /// </summary>
        private void OnSilenceDetected(int valid_frame)
        {
            _lastest_confirmed_silence_frame = valid_frame;
            if (_vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected)
            {
                PopDataBufTillFrame(valid_frame);
            }
        }

        /// <summary>Records a confirmed speech frame and appends it to the current segment.</summary>
        private void OnVoiceDetected(int valid_frame)
        {
            _latest_confirmed_speech_frame = valid_frame;
            PopDataToOutputBuf(valid_frame, 1, false, false, false);
        }

        /// <summary>
        /// Confirms the start frame of an utterance and opens a new output segment.
        /// </summary>
        /// <param name="start_frame">Frame index of the start point.</param>
        /// <param name="fake_result">Set by the silence-timeout path, which reports a zero-length decision.</param>
        private void OnVoiceStart(int start_frame, bool fake_result = false)
        {
            if (_confirmed_start_frame != -1)
            {
                Console.WriteLine("not reset vad properly\n");
            }
            else
            {
                _confirmed_start_frame = start_frame;
            }

            // NOTE(review): because the two conditions are OR-ed, a fake result still opens a
            // segment while the start point is undetected - confirm this is intended.
            if (!fake_result || _vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected)
            {
                PopDataToOutputBuf(_confirmed_start_frame, 1, true, false, false);
            }
        }

        /// <summary>
        /// Confirms the end frame of an utterance: flushes the speech frames up to (but not
        /// including) <paramref name="end_frame"/>, then closes the segment unless the result is fake.
        /// </summary>
        /// <param name="end_frame">Frame index of the end point.</param>
        /// <param name="fake_result">When set, no end frame is appended to the output segment.</param>
        /// <param name="is_last_frame">Forwarded to <c>PopDataToOutputBuf</c> as <c>end_point_is_sent_end</c>.</param>
        private void OnVoiceEnd(int end_frame, bool fake_result, bool is_last_frame)
        {
            for (int t = _latest_confirmed_speech_frame + 1; t < end_frame; t++)
            {
                OnVoiceDetected(t);
            }

            if (_confirmed_end_frame != -1)
            {
                Console.WriteLine("not reset vad properly\n");
            }
            else
            {
                _confirmed_end_frame = end_frame;
            }

            if (!fake_result)
            {
                _sil_frame = 0;
                PopDataToOutputBuf(_confirmed_end_frame, 1, false, true, is_last_frame);
            }
            _number_end_time_detected += 1;
        }

        /// <summary>Ends the current utterance at <paramref name="cur_frm_idx"/> when it is the final frame.</summary>
        private void MaybeOnVoiceEndIfLastFrame(bool is_final_frame, int cur_frm_idx)
        {
            if (!is_final_frame)
            {
                return;
            }

            OnVoiceEnd(cur_frm_idx, false, true);
            _vad_state_machine = VadStateMachine.kVadInStateEndPointDetected;
        }

        /// <summary>Start-point latency expressed in milliseconds.</summary>
        private int GetLatency()
        {
            return (int)(LatencyFrmNumAtStartPoint() * _vad_opts.frame_in_ms);
        }

        /// <summary>
        /// Start-point latency in frames: the detector window size, plus the configured
        /// look-back time when <c>do_extend</c> is enabled.
        /// </summary>
        private int LatencyFrmNumAtStartPoint()
        {
            int vad_latency = _windows_detector.GetWinSize();
            if (_vad_opts.do_extend != 0)
            {
                vad_latency += (int)(_vad_opts.lookback_time_start_point / _vad_opts.frame_in_ms);
            }
            return vad_latency;
        }

        /// <summary>
        /// Classifies frame <paramref name="t"/> as speech or silence from its energy, its
        /// signal-to-noise ratio against the running noise level, and the model's silence scores.
        /// </summary>
        /// <param name="t">Absolute frame index within the stream.</param>
        private FrameState GetFrameState(int t)
        {
            double cur_decibel = _decibel[t];
            double cur_snr = cur_decibel - _noise_average_decibel;

            if (cur_decibel < _vad_opts.decibel_thres)
            {
                // NOTE(review): this path feeds the frame into DetectOneFrame itself, and the
                // callers in this class call DetectOneFrame again for the same frame - confirm
                // whether the double update is intended.
                DetectOneFrame(FrameState.kFrameStateSil, t, false);
                return FrameState.kFrameStateSil;
            }

            double sum_score = 0.0D;
            double noise_prob = 0.0D;
            Trace.Assert(_sil_pdf_ids.Length == _vad_opts.silence_pdf_num, "");
            if (_sil_pdf_ids.Length > 0)
            {
                // Only batch_size = 1 is supported.
                Trace.Assert(_scores.GetLength(0) == 1, "只支持batch_size = 1的测试");

                // Accumulate in double and narrow to float, matching a float LINQ Sum().
                double sil_score_total = 0.0D;
                foreach (int sil_pdf_id in _sil_pdf_ids)
                {
                    sil_score_total += _scores[0, t - _idx_pre_chunk, sil_pdf_id];
                }
                sum_score = (float)sil_score_total;

                noise_prob = Math.Log(sum_score) * _vad_opts.speech_2_noise_ratio;
                // What is left after the silence scores is treated as the speech score.
                sum_score = 1.0D - sum_score;
            }
            double speech_prob = Math.Log(sum_score);

            if (_vad_opts.output_frame_probs)
            {
                E2EVadFrameProbEntity frame_prob = new E2EVadFrameProbEntity();
                frame_prob.noise_prob = noise_prob;
                frame_prob.speech_prob = speech_prob;
                frame_prob.score = sum_score;
                frame_prob.frame_id = t;
                _frame_probs.Add(frame_prob);
            }

            if (Math.Exp(speech_prob) >= Math.Exp(noise_prob) + _speech_noise_thres)
            {
                bool loud_enough = cur_snr >= _vad_opts.snr_thres && cur_decibel >= _vad_opts.decibel_thres;
                return loud_enough ? FrameState.kFrameStateSpeech : FrameState.kFrameStateSil;
            }

            // Silence by score: fold this frame's energy into the running noise level.
            if (_noise_average_decibel < -99.9)
            {
                _noise_average_decibel = cur_decibel;
            }
            else
            {
                _noise_average_decibel = (cur_decibel + _noise_average_decibel * (_vad_opts.noise_frame_num_used_for_snr - 1)) / _vad_opts.noise_frame_num_used_for_snr;
            }
            return FrameState.kFrameStateSil;
        }

        /// <summary>
        /// Processes one chunk of scores and audio and returns the segments that became available.
        /// </summary>
        /// <param name="score">Model scores indexed as [batch, frame, pdf id]; only a batch size of 1 is supported.</param>
        /// <param name="waveform">Audio samples of the chunk.</param>
        /// <param name="is_final">Marks the last chunk; all state is reset before returning.</param>
        /// <param name="max_end_sil">Maximum end silence; <c>speech_to_sil_time_thres</c> is subtracted to obtain the end-silence threshold.</param>
        /// <param name="online">
        /// When set, a segment is reported as soon as its start point is known, using -1 for a
        /// start or end that is not (or no longer) reported in this call.
        /// </param>
        /// <returns>
        /// One entry per batch element; an entry is <c>null</c> when no segment was produced for it.
        /// Each segment is an <c>int[] { start_ms, end_ms }</c>.
        /// </returns>
        public SegmentEntity[] DefaultCall(float[,,] score, float[] waveform,
            bool is_final = false, int max_end_sil = 800, bool online = false
            )
        {
            _max_end_sil_frame_cnt_thresh = max_end_sil - _vad_opts.speech_to_sil_time_thres;

            ComputeDecibel(waveform);
            ComputeScores(score);
            if (is_final)
            {
                DetectLastFrames();
            }
            else
            {
                DetectCommonFrames();
            }

            int batchSize = score.GetLength(0);
            SegmentEntity[] segments = new SegmentEntity[batchSize];
            for (int batch_num = 0; batch_num < batchSize; batch_num++) // only support batch_size = 1 now
            {
                List<int[]> segment_batch = new List<int[]>();

                // The loop start is captured once; _output_data_buf_offset is advanced inside the loop.
                for (int i = _output_data_buf_offset; i < _output_data_buf.Count; i++)
                {
                    E2EVadSpeechBufWithDoaEntity seg = _output_data_buf[i];
                    int start_ms;
                    int end_ms;
                    if (online)
                    {
                        if (!seg.contain_seg_start_point)
                        {
                            continue;
                        }
                        if (!_next_seg && !seg.contain_seg_end_point)
                        {
                            continue;
                        }

                        start_ms = _next_seg ? seg.start_ms : -1;
                        if (seg.contain_seg_end_point)
                        {
                            end_ms = seg.end_ms;
                            _next_seg = true;
                            _output_data_buf_offset += 1;
                        }
                        else
                        {
                            end_ms = -1;
                            _next_seg = false;
                        }
                    }
                    else
                    {
                        bool complete = seg.contain_seg_start_point && seg.contain_seg_end_point;
                        if (!is_final && !complete)
                        {
                            continue;
                        }

                        start_ms = seg.start_ms;
                        end_ms = seg.end_ms;
                        _output_data_buf_offset += 1;
                    }

                    segment_batch.Add(new int[] { start_ms, end_ms });
                }

                if (segment_batch.Count > 0)
                {
                    SegmentEntity entity = new SegmentEntity();
                    entity.Segment.AddRange(segment_batch);
                    segments[batch_num] = entity;
                }
            }

            if (is_final)
            {
                // Reset all state so the instance can be reused for the next query.
                AllResetDetection();
            }
            return segments;
        }

        /// <summary>
        /// Runs detection over every frame of the current (non-final) chunk and then advances
        /// the chunk offset used to index into the scores.
        /// </summary>
        /// <returns>Always 0.</returns>
        private int DetectCommonFrames()
        {
            if (_vad_state_machine == VadStateMachine.kVadInStateEndPointDetected)
            {
                return 0;
            }

            int first_frame = _frm_cnt - _vad_opts.nn_eval_block_size;
            for (int frame = first_frame; frame < _frm_cnt; frame++)
            {
                FrameState frame_state = GetFrameState(frame);
                DetectOneFrame(frame_state, frame, false);
            }

            _idx_pre_chunk += _scores.GetLength(1) * _scores.GetLength(0);
            return 0;
        }

        /// <summary>
        /// Runs detection over every frame of the final chunk; the last frame is flagged as final.
        /// Failures are reported but do not propagate, so segments found so far are still returned.
        /// </summary>
        /// <returns>Always 0.</returns>
        private int DetectLastFrames()
        {
            if (_vad_state_machine == VadStateMachine.kVadInStateEndPointDetected)
            {
                return 0;
            }

            try
            {
                int first_frame = _frm_cnt - _vad_opts.nn_eval_block_size;
                for (int frame = first_frame; frame < _frm_cnt; frame++)
                {
                    FrameState frame_state = GetFrameState(frame);
                    DetectOneFrame(frame_state, frame, frame == _frm_cnt - 1);
                }
            }
            catch (Exception e)
            {
                // Processing of the last chunk stays best-effort, but the failure is no longer silent.
                Trace.TraceWarning("E2EVadModel.DetectLastFrames aborted: " + e);
            }
            return 0;
        }

        /// <summary>
        /// Shared handling for a frame that arrives while inside a speech segment: ends the
        /// segment when it exceeds <c>max_single_segment_time</c>, otherwise records the frame
        /// as speech, or closes the segment if this is the final frame.
        /// </summary>
        /// <param name="mark_time_out">When set, <c>_max_time_out</c> is raised on a length timeout.</param>
        private void HandleFrameInSpeechSegment(int cur_frm_idx, bool is_final_frame, int frm_shift_in_ms, bool mark_time_out)
        {
            if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
            {
                if (mark_time_out)
                {
                    _max_time_out = true;
                }
                OnVoiceEnd(cur_frm_idx, false, false);
                _vad_state_machine = VadStateMachine.kVadInStateEndPointDetected;
            }
            else if (!is_final_frame)
            {
                OnVoiceDetected(cur_frm_idx);
            }
            else
            {
                MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
            }
        }

        /// <summary>
        /// Feeds one frame decision into the window detector and advances the endpoint state
        /// machine according to the transition it reports.
        /// </summary>
        /// <param name="cur_frm_state">Speech/silence decision for the frame.</param>
        /// <param name="cur_frm_idx">Absolute frame index.</param>
        /// <param name="is_final_frame">True for the last frame of the stream.</param>
        private void DetectOneFrame(FrameState cur_frm_state, int cur_frm_idx, bool is_final_frame)
        {
            FrameState tmp_cur_frm_state = FrameState.kFrameStateInvalid;
            if (cur_frm_state == FrameState.kFrameStateSpeech)
            {
                tmp_cur_frm_state = Math.Abs(1.0) > _vad_opts.fe_prior_thres
                    ? FrameState.kFrameStateSpeech
                    : FrameState.kFrameStateSil;
            }
            else if (cur_frm_state == FrameState.kFrameStateSil)
            {
                tmp_cur_frm_state = FrameState.kFrameStateSil;
            }

            AudioChangeState state_change = _windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx);
            int frm_shift_in_ms = _vad_opts.frame_in_ms;

            // Early returns below deliberately skip the multi-utterance reset at the end.
            switch (state_change)
            {
                case AudioChangeState.kChangeStateSil2Speech:
                    _continous_silence_frame_count = 0;
                    _pre_end_silence_detected = false;
                    if (_vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected)
                    {
                        int start_frame = Math.Max(_data_buf_start_frame, cur_frm_idx - LatencyFrmNumAtStartPoint());
                        OnVoiceStart(start_frame);
                        _vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment;
                        for (int t = start_frame + 1; t < cur_frm_idx + 1; t++)
                        {
                            OnVoiceDetected(t);
                        }
                    }
                    else if (_vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment)
                    {
                        for (int t = _latest_confirmed_speech_frame + 1; t < cur_frm_idx; t++)
                        {
                            OnVoiceDetected(t);
                        }
                        HandleFrameInSpeechSegment(cur_frm_idx, is_final_frame, frm_shift_in_ms, false);
                    }
                    else
                    {
                        return;
                    }
                    break;

                case AudioChangeState.kChangeStateSpeech2Sil:
                    _continous_silence_frame_count = 0;
                    if (_vad_state_machine != VadStateMachine.kVadInStateInSpeechSegment)
                    {
                        return;
                    }
                    HandleFrameInSpeechSegment(cur_frm_idx, is_final_frame, frm_shift_in_ms, false);
                    break;

                case AudioChangeState.kChangeStateSpeech2Speech:
                    _continous_silence_frame_count = 0;
                    if (_vad_state_machine != VadStateMachine.kVadInStateInSpeechSegment)
                    {
                        return;
                    }
                    HandleFrameInSpeechSegment(cur_frm_idx, is_final_frame, frm_shift_in_ms, true);
                    break;

                case AudioChangeState.kChangeStateSil2Sil:
                    _continous_silence_frame_count += 1;
                    if (_vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected)
                    {
                        bool start_silence_time_out =
                            _vad_opts.detect_mode == (int)VadDetectMode.kVadSingleUtteranceDetectMode
                            && _continous_silence_frame_count * frm_shift_in_ms > _vad_opts.max_start_silence_time;
                        bool stream_ended_without_speech = is_final_frame && _number_end_time_detected == 0;

                        if (start_silence_time_out || stream_ended_without_speech)
                        {
                            // Silence timeout: report a zero-length decision.
                            for (int t = _lastest_confirmed_silence_frame + 1; t < cur_frm_idx; t++)
                            {
                                OnSilenceDetected(t);
                            }
                            OnVoiceStart(0, true);
                            OnVoiceEnd(0, true, false);
                            _vad_state_machine = VadStateMachine.kVadInStateEndPointDetected;
                        }
                        else if (cur_frm_idx >= LatencyFrmNumAtStartPoint())
                        {
                            OnSilenceDetected(cur_frm_idx - LatencyFrmNumAtStartPoint());
                        }
                    }
                    else if (_vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment)
                    {
                        if (_continous_silence_frame_count * frm_shift_in_ms >= _max_end_sil_frame_cnt_thresh)
                        {
                            // Enough trailing silence: place the end point back where the silence began.
                            int lookback_frame = (int)(_max_end_sil_frame_cnt_thresh / frm_shift_in_ms);
                            if (_vad_opts.do_extend != 0)
                            {
                                lookback_frame -= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms);
                                lookback_frame -= 1;
                                lookback_frame = Math.Max(0, lookback_frame);
                            }
                            OnVoiceEnd(cur_frm_idx - lookback_frame, false, false);
                            _vad_state_machine = VadStateMachine.kVadInStateEndPointDetected;
                        }
                        else if (cur_frm_idx - _confirmed_start_frame + 1 > _vad_opts.max_single_segment_time / frm_shift_in_ms)
                        {
                            OnVoiceEnd(cur_frm_idx, false, false);
                            _vad_state_machine = VadStateMachine.kVadInStateEndPointDetected;
                        }
                        else if (_vad_opts.do_extend != 0 && !is_final_frame)
                        {
                            // Keep the first few silent frames inside the segment (look-ahead).
                            if (_continous_silence_frame_count <= (int)(_vad_opts.lookahead_time_end_point / frm_shift_in_ms))
                            {
                                OnVoiceDetected(cur_frm_idx);
                            }
                        }
                        else
                        {
                            MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx);
                        }
                    }
                    else
                    {
                        return;
                    }
                    break;

                default:
                    break;
            }

            if (_vad_state_machine == VadStateMachine.kVadInStateEndPointDetected
                && _vad_opts.detect_mode == (int)VadDetectMode.kVadMutipleUtteranceDetectMode)
            {
                ResetDetection();
            }
        }
    }
}