/** * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights * Reserved. MIT License (https://opensource.org/licenses/MIT) */ /* 2022-2023 by zhaomingwork */ // client for websocket, support multiple threads // ./funasr-wss-client --server-ip // --port // --wav-path // [--thread-num ] // [--is-ssl ] [--] // [--version] [-h] // example: // ./funasr-wss-client --server-ip 127.0.0.1 --port 10095 --wav-path test.wav // --thread-num 1 --is-ssl 1 #define ASIO_STANDALONE 1 #include #include "portaudio.h" #include #include #include #include #include #include #include #include #include "util.h" #include "audio.h" #include "nlohmann/json.hpp" #include "tclap/CmdLine.h" #include "microphone.h" /** * Define a semi-cross platform helper method that waits/sleeps for a bit. */ void WaitABit() { #ifdef WIN32 Sleep(500); #else usleep(500); #endif } std::atomic wav_index(0); typedef websocketpp::config::asio_client::message_type::ptr message_ptr; typedef websocketpp::lib::shared_ptr context_ptr; using websocketpp::lib::bind; using websocketpp::lib::placeholders::_1; using websocketpp::lib::placeholders::_2; context_ptr OnTlsInit(websocketpp::connection_hdl) { context_ptr ctx = websocketpp::lib::make_shared( asio::ssl::context::sslv23); try { ctx->set_options( asio::ssl::context::default_workarounds | asio::ssl::context::no_sslv2 | asio::ssl::context::no_sslv3 | asio::ssl::context::single_dh_use); } catch (std::exception& e) { LOG(ERROR) << e.what(); } return ctx; } // template for tls or not config template class WebsocketClient { public: // typedef websocketpp::client client; // typedef websocketpp::client // wss_client; typedef websocketpp::lib::lock_guard scoped_lock; WebsocketClient(int is_ssl) : m_open(false), m_done(false) { // set up access channels to only log interesting things m_client.clear_access_channels(websocketpp::log::alevel::all); m_client.set_access_channels(websocketpp::log::alevel::connect); m_client.set_access_channels(websocketpp::log::alevel::disconnect); m_client.set_access_channels(websocketpp::log::alevel::app); // Initialize the Asio transport policy m_client.init_asio(); // Bind the handlers we are using using websocketpp::lib::bind; using websocketpp::lib::placeholders::_1; m_client.set_open_handler(bind(&WebsocketClient::on_open, this, _1)); m_client.set_close_handler(bind(&WebsocketClient::on_close, this, _1)); m_client.set_message_handler( [this](websocketpp::connection_hdl hdl, message_ptr msg) { on_message(hdl, msg); }); m_client.set_fail_handler(bind(&WebsocketClient::on_fail, this, _1)); m_client.clear_access_channels(websocketpp::log::alevel::all); } void on_message(websocketpp::connection_hdl hdl, message_ptr msg) { const std::string& payload = msg->get_payload(); switch (msg->get_opcode()) { case websocketpp::frame::opcode::text: nlohmann::json jsonresult = nlohmann::json::parse(payload); LOG(INFO) << "Thread: " << this_thread::get_id() << ",on_message = " << payload; if (jsonresult["is_final"] == true) { websocketpp::lib::error_code ec; m_client.close(hdl, websocketpp::close::status::going_away, "", ec); if (ec) { LOG(ERROR) << "Error closing connection " << ec.message(); } } } } // This method will block until the connection is complete void run(const std::string& uri, const std::vector& wav_list, const std::vector& wav_ids, int audio_fs, std::string asr_mode, std::vector chunk_size, const std::unordered_map& hws_map, bool is_record=false, int use_itn=1) { // Create a new connection to the given URI websocketpp::lib::error_code ec; typename websocketpp::client::connection_ptr con = m_client.get_connection(uri, ec); if (ec) { m_client.get_alog().write(websocketpp::log::alevel::app, "Get Connection Error: " + ec.message()); return; } // Grab a handle for this connection so we can talk to it in a thread // safe manor after the event loop starts. m_hdl = con->get_handle(); // Queue the connection. No DNS queries or network connections will be // made until the io_service event loop is run. m_client.connect(con); // Create a thread to run the ASIO io_service event loop websocketpp::lib::thread asio_thread(&websocketpp::client::run, &m_client); if(is_record){ send_rec_data(asr_mode, chunk_size, hws_map, use_itn); }else{ send_wav_data(wav_list[0], wav_ids[0], audio_fs, asr_mode, chunk_size, hws_map, use_itn); } WaitABit(); asio_thread.join(); } // The open handler will signal that we are ready to start sending data void on_open(websocketpp::connection_hdl) { m_client.get_alog().write(websocketpp::log::alevel::app, "Connection opened, starting data!"); scoped_lock guard(m_lock); m_open = true; } // The close handler will signal that we should stop sending data void on_close(websocketpp::connection_hdl) { m_client.get_alog().write(websocketpp::log::alevel::app, "Connection closed, stopping data!"); scoped_lock guard(m_lock); m_done = true; } // The fail handler will signal that we should stop sending data void on_fail(websocketpp::connection_hdl) { m_client.get_alog().write(websocketpp::log::alevel::app, "Connection failed, stopping data!"); scoped_lock guard(m_lock); m_done = true; } // send wav to server void send_wav_data(string wav_path, string wav_id, int audio_fs, std::string asr_mode, std::vector chunk_vector, const std::unordered_map& hws_map, int use_itn) { uint64_t count = 0; std::stringstream val; funasr::Audio audio(1); int32_t sampling_rate = audio_fs; std::string wav_format = "pcm"; if (funasr::IsTargetFile(wav_path.c_str(), "wav")) { if (!audio.LoadWav(wav_path.c_str(), &sampling_rate, false)) return; } else if (funasr::IsTargetFile(wav_path.c_str(), "pcm")) { if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate, false)) return; } else { wav_format = "others"; if (!audio.LoadOthers2Char(wav_path.c_str())) return; } float* buff; int len; int flag = 0; bool wait = false; while (1) { { scoped_lock guard(m_lock); // If the connection has been closed, stop generating data if (m_done) { break; } // If the connection hasn't been opened yet wait a bit and retry if (!m_open) { wait = true; } else { break; } } if (wait) { // LOG(INFO) << "wait.." << m_open; WaitABit(); continue; } } websocketpp::lib::error_code ec; nlohmann::json jsonbegin; nlohmann::json chunk_size = nlohmann::json::array(); chunk_size.push_back(chunk_vector[0]); chunk_size.push_back(chunk_vector[1]); chunk_size.push_back(chunk_vector[2]); jsonbegin["mode"] = asr_mode; jsonbegin["chunk_size"] = chunk_size; jsonbegin["wav_name"] = wav_id; jsonbegin["wav_format"] = wav_format; jsonbegin["audio_fs"] = sampling_rate; jsonbegin["is_speaking"] = true; jsonbegin["itn"] = true; if(use_itn == 0){ jsonbegin["itn"] = false; } if(!hws_map.empty()){ LOG(INFO) << "hotwords: "; for (const auto& pair : hws_map) { LOG(INFO) << pair.first << " : " << pair.second; } nlohmann::json json_map(hws_map); std::string json_map_str = json_map.dump(); jsonbegin["hotwords"] = json_map_str; } m_client.send(m_hdl, jsonbegin.dump(), websocketpp::frame::opcode::text, ec); // fetch wav data use asr engine api if (wav_format == "pcm") { while (audio.Fetch(buff, len, flag) > 0) { short* iArray = new short[len]; for (size_t i = 0; i < len; ++i) { iArray[i] = (short)(buff[i] * 32768); } // send data to server int offset = 0; int block_size = 102400; while (offset < len) { int send_block = 0; if (offset + block_size <= len) { send_block = block_size; } else { send_block = len - offset; } m_client.send(m_hdl, iArray + offset, send_block * sizeof(short), websocketpp::frame::opcode::binary, ec); offset += send_block; } LOG(INFO) << "sended data len=" << len * sizeof(short); if (ec) { m_client.get_alog().write(websocketpp::log::alevel::app, "Send Error: " + ec.message()); break; } delete[] iArray; } } else { int offset = 0; int block_size = 204800; len = audio.GetSpeechLen(); char* others_buff = audio.GetSpeechChar(); while (offset < len) { int send_block = 0; if (offset + block_size <= len) { send_block = block_size; } else { send_block = len - offset; } m_client.send(m_hdl, others_buff + offset, send_block, websocketpp::frame::opcode::binary, ec); offset += send_block; } LOG(INFO) << "sended data len=" << len; if (ec) { m_client.get_alog().write(websocketpp::log::alevel::app, "Send Error: " + ec.message()); } } nlohmann::json jsonresult; jsonresult["is_speaking"] = false; m_client.send(m_hdl, jsonresult.dump(), websocketpp::frame::opcode::text, ec); WaitABit(); } static int RecordCallback(const void* inputBuffer, void* outputBuffer, unsigned long framesPerBuffer, const PaStreamCallbackTimeInfo* timeInfo, PaStreamCallbackFlags statusFlags, void* userData) { std::vector* buffer = static_cast*>(userData); const float* input = static_cast(inputBuffer); for (unsigned int i = 0; i < framesPerBuffer; i++) { buffer->push_back(input[i]); } return paContinue; } void send_rec_data(std::string asr_mode, std::vector chunk_vector, const std::unordered_map& hws_map, int use_itn) { // first message bool wait = false; while (1) { { scoped_lock guard(m_lock); // If the connection has been closed, stop generating data if (m_done) { break; } // If the connection hasn't been opened yet wait a bit and retry if (!m_open) { wait = true; } else { break; } } if (wait) { // LOG(INFO) << "wait.." << m_open; WaitABit(); continue; } } websocketpp::lib::error_code ec; float sample_rate = 16000; nlohmann::json jsonbegin; nlohmann::json chunk_size = nlohmann::json::array(); chunk_size.push_back(chunk_vector[0]); chunk_size.push_back(chunk_vector[1]); chunk_size.push_back(chunk_vector[2]); jsonbegin["mode"] = asr_mode; jsonbegin["chunk_size"] = chunk_size; jsonbegin["wav_name"] = "record"; jsonbegin["wav_format"] = "pcm"; jsonbegin["audio_fs"] = sample_rate; jsonbegin["is_speaking"] = true; jsonbegin["itn"] = true; if(use_itn == 0){ jsonbegin["itn"] = false; } if(!hws_map.empty()){ LOG(INFO) << "hotwords: "; for (const auto& pair : hws_map) { LOG(INFO) << pair.first << " : " << pair.second; } nlohmann::json json_map(hws_map); std::string json_map_str = json_map.dump(); jsonbegin["hotwords"] = json_map_str; } m_client.send(m_hdl, jsonbegin.dump(), websocketpp::frame::opcode::text, ec); // mic Microphone mic; PaDeviceIndex num_devices = Pa_GetDeviceCount(); LOG(INFO) << "Num devices: " << num_devices; PaStreamParameters param; param.device = Pa_GetDefaultInputDevice(); if (param.device == paNoDevice) { LOG(INFO) << "No default input device found"; exit(EXIT_FAILURE); } LOG(INFO) << "Use default device: " << param.device; const PaDeviceInfo *info = Pa_GetDeviceInfo(param.device); LOG(INFO) << " Name: " << info->name; LOG(INFO) << " Max input channels: " << info->maxInputChannels; param.channelCount = 1; param.sampleFormat = paFloat32; param.suggestedLatency = info->defaultLowInputLatency; param.hostApiSpecificStreamInfo = nullptr; PaStream *stream; std::vector buffer; PaError err = Pa_OpenStream(&stream, ¶m, nullptr, /* &outputParameters, */ sample_rate, 0, // frames per buffer paClipOff, // we won't output out of range samples // so don't bother clipping them RecordCallback, &buffer); if (err != paNoError) { LOG(ERROR) << "portaudio error: " << Pa_GetErrorText(err); exit(EXIT_FAILURE); } err = Pa_StartStream(stream); LOG(INFO) << "Started: "; if (err != paNoError) { LOG(ERROR) << "portaudio error: " << Pa_GetErrorText(err); exit(EXIT_FAILURE); } while(true){ int len = buffer.size(); short* iArray = new short[len]; for (size_t i = 0; i < len; ++i) { iArray[i] = (short)(buffer[i] * 32768); } m_client.send(m_hdl, iArray, len * sizeof(short), websocketpp::frame::opcode::binary, ec); buffer.clear(); if (ec) { m_client.get_alog().write(websocketpp::log::alevel::app, "Send Error: " + ec.message()); } Pa_Sleep(20); // sleep for 20ms } nlohmann::json jsonresult; jsonresult["is_speaking"] = false; m_client.send(m_hdl, jsonresult.dump(), websocketpp::frame::opcode::text, ec); err = Pa_CloseStream(stream); if (err != paNoError) { LOG(INFO) << "portaudio error: " << Pa_GetErrorText(err); exit(EXIT_FAILURE); } } websocketpp::client m_client; private: websocketpp::connection_hdl m_hdl; websocketpp::lib::mutex m_lock; bool m_open; bool m_done; int total_num = 0; }; int main(int argc, char* argv[]) { #ifdef _WIN32 #include SetConsoleOutputCP(65001); #endif google::InitGoogleLogging(argv[0]); FLAGS_logtostderr = true; TCLAP::CmdLine cmd("funasr-wss-client-2pass", ' ', "1.0"); TCLAP::ValueArg server_ip_("", "server-ip", "server-ip", true, "127.0.0.1", "string"); TCLAP::ValueArg port_("", "port", "port", true, "10095", "string"); TCLAP::ValueArg wav_path_( "", "wav-path", "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: " "asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", false, "", "string"); TCLAP::ValueArg audio_fs_("", "audio-fs", "the sample rate of audio", false, 16000, "int32_t"); TCLAP::ValueArg record_( "", "record", "record is 1 means use record", false, 0, "int"); TCLAP::ValueArg asr_mode_("", ASR_MODE, "offline, online, 2pass", false, "2pass", "string"); TCLAP::ValueArg chunk_size_("", "chunk-size", "chunk_size: 5-10-5 or 5-12-5", false, "5-10-5", "string"); TCLAP::ValueArg thread_num_("", "thread-num", "thread-num", false, 1, "int"); TCLAP::ValueArg is_ssl_( "", "is-ssl", "is-ssl is 1 means use wss connection, or use ws connection", false, 1, "int"); TCLAP::ValueArg use_itn_( "", "use-itn", "use-itn is 1 means use itn, 0 means not use itn", false, 1, "int"); TCLAP::ValueArg hotword_("", HOTWORD, "the hotword file, one hotword perline, Format: Hotword Weight (could be: 阿里巴巴 20)", false, "", "string"); cmd.add(server_ip_); cmd.add(port_); cmd.add(wav_path_); cmd.add(audio_fs_); cmd.add(asr_mode_); cmd.add(record_); cmd.add(chunk_size_); cmd.add(thread_num_); cmd.add(is_ssl_); cmd.add(use_itn_); cmd.add(hotword_); cmd.parse(argc, argv); std::string server_ip = server_ip_.getValue(); std::string port = port_.getValue(); std::string wav_path = wav_path_.getValue(); std::string asr_mode = asr_mode_.getValue(); std::string chunk_size_str = chunk_size_.getValue(); int use_itn = use_itn_.getValue(); // get chunk_size std::vector chunk_size; std::stringstream ss(chunk_size_str); std::string item; while (std::getline(ss, item, '-')) { try { chunk_size.push_back(stoi(item)); } catch (const invalid_argument&) { LOG(ERROR) << "Invalid argument: " << item; exit(-1); } } int threads_num = thread_num_.getValue(); int is_ssl = is_ssl_.getValue(); int is_record = record_.getValue(); std::string uri = ""; if (is_ssl == 1) { uri = "wss://" + server_ip + ":" + port; } else { uri = "ws://" + server_ip + ":" + port; } // hotwords std::string hotword_path = hotword_.getValue(); unordered_map hws_map; if(!hotword_path.empty()){ LOG(INFO) << "hotword path: " << hotword_path; funasr::ExtractHws(hotword_path, hws_map); } int audio_fs = audio_fs_.getValue(); if(is_record == 1){ std::vector tmp_wav_list; std::vector tmp_wav_ids; if (is_ssl == 1) { WebsocketClient c(is_ssl); c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1)); c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn); } else { WebsocketClient c(is_ssl); c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, true, use_itn); } }else{ // read wav_path std::vector wav_list; std::vector wav_ids; string default_id = "wav_default_id"; if (funasr::IsTargetFile(wav_path, "scp")) { ifstream in(wav_path); if (!in.is_open()) { printf("Failed to open scp file"); return 0; } string line; while (getline(in, line)) { istringstream iss(line); string column1, column2; iss >> column1 >> column2; wav_list.emplace_back(column2); wav_ids.emplace_back(column1); } in.close(); } else { wav_list.emplace_back(wav_path); wav_ids.emplace_back(default_id); } for (size_t wav_i = 0; wav_i < wav_list.size(); wav_i = wav_i + threads_num) { std::vector client_threads; for (size_t i = 0; i < threads_num; i++) { if (wav_i + i >= wav_list.size()) { break; } std::vector tmp_wav_list; std::vector tmp_wav_ids; tmp_wav_list.emplace_back(wav_list[wav_i + i]); tmp_wav_ids.emplace_back(wav_ids[wav_i + i]); client_threads.emplace_back( [uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, is_ssl, hws_map, use_itn]() { if (is_ssl == 1) { WebsocketClient c(is_ssl); c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1)); c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn); } else { WebsocketClient c(is_ssl); c.run(uri, tmp_wav_list, tmp_wav_ids, audio_fs, asr_mode, chunk_size, hws_map, false, use_itn); } }); } for (auto& t : client_threads) { t.join(); } } } }